40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
49#define DEBUG_TYPE "si-lower"
54 "amdgpu-disable-loop-alignment",
55 cl::desc(
"Do not align and prefetch loops"),
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc(
"Use indirect register addressing for divergent indexes"),
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (
unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
78 return AMDGPU::SGPR0 + Reg;
193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
196 MVT::i1, MVT::v32i32},
200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
203 MVT::i1, MVT::v32i32},
276 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1},
Expand);
283 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
284 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
285 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
288 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
289 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
290 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
294 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
295 MVT::v3i16, MVT::v4i16, MVT::Other},
300 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64},
Expand);
316 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
317 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
318 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
319 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
320 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
321 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
322 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
323 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
355 for (
MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
369 for (
MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
383 for (
MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
397 for (
MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
411 for (
MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
426 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
435 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
436 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
441 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32},
Custom);
445 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
446 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
447 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
448 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
525 {MVT::f32, MVT::f64},
Legal);
618 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
619 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
620 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
745 {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Custom);
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
791 for (
MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
799 for (
MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16},
Legal);
857 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
858 MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
862 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
863 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
864 MVT::i16, MVT::i8, MVT::i128},
868 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
869 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
962 EVT DestVT,
EVT SrcVT)
const {
972 LLT DestTy,
LLT SrcTy)
const {
973 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->
hasMadMixInsts()) ||
974 (Opcode == TargetOpcode::G_FMA && Subtarget->
hasFmaMixInsts())) &&
1000 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1002 return VT.
isInteger() ? MVT::i32 : MVT::f32;
1029 return (NumElts + 1) / 2;
1035 return NumElts * ((
Size + 31) / 32);
1044 EVT VT,
EVT &IntermediateVT,
1045 unsigned &NumIntermediates,
MVT &RegisterVT)
const {
1054 if (ScalarVT == MVT::bf16) {
1055 RegisterVT = MVT::i32;
1056 IntermediateVT = MVT::v2bf16;
1058 RegisterVT = VT.
isInteger() ? MVT::v2i16 : MVT::v2f16;
1059 IntermediateVT = RegisterVT;
1061 NumIntermediates = (NumElts + 1) / 2;
1062 return NumIntermediates;
1067 IntermediateVT = RegisterVT;
1068 NumIntermediates = NumElts;
1069 return NumIntermediates;
1072 if (Size < 16 && Subtarget->has16BitInsts()) {
1074 RegisterVT = MVT::i16;
1075 IntermediateVT = ScalarVT;
1076 NumIntermediates = NumElts;
1077 return NumIntermediates;
1082 RegisterVT = MVT::i32;
1083 IntermediateVT = ScalarVT;
1084 NumIntermediates = NumElts;
1085 return NumIntermediates;
1089 RegisterVT = MVT::i32;
1090 IntermediateVT = RegisterVT;
1091 NumIntermediates = NumElts * ((
Size + 31) / 32);
1092 return NumIntermediates;
1097 Context,
CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1101 assert(MaxNumLanes != 0);
1103 if (
auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1104 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1115 auto *ST = dyn_cast<StructType>(Ty);
1120 assert(ST->getNumContainedTypes() == 2 &&
1121 ST->getContainedType(1)->isIntegerTy(32));
1136 DL.getPointerSizeInBits(AS) == 192)
1146 DL.getPointerSizeInBits(AS) == 160) ||
1148 DL.getPointerSizeInBits(AS) == 192))
1156 unsigned IntrID)
const {
1158 if (CI.
hasMetadata(LLVMContext::MD_invariant_load))
1172 if (RsrcIntr->IsImage)
1176 if (
auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->
getType())) {
1183 Info.ptrVal = RsrcArg;
1191 unsigned MaxNumLanes = 4;
1193 if (RsrcIntr->IsImage) {
1217 if (RsrcIntr->IsImage) {
1218 unsigned DMask = cast<ConstantInt>(CI.
getArgOperand(1))->getZExtValue();
1239 case Intrinsic::amdgcn_raw_buffer_load_lds:
1240 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1241 case Intrinsic::amdgcn_struct_buffer_load_lds:
1242 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1243 unsigned Width = cast<ConstantInt>(CI.
getArgOperand(2))->getZExtValue();
1254 case Intrinsic::amdgcn_ds_ordered_add:
1255 case Intrinsic::amdgcn_ds_ordered_swap:
1256 case Intrinsic::amdgcn_ds_fadd:
1257 case Intrinsic::amdgcn_ds_fmin:
1258 case Intrinsic::amdgcn_ds_fmax: {
1271 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1279 if (!Vol || !Vol->
isZero())
1284 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1285 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1288 Info.ptrVal =
nullptr;
1293 case Intrinsic::amdgcn_ds_append:
1294 case Intrinsic::amdgcn_ds_consume: {
1307 case Intrinsic::amdgcn_global_atomic_csub: {
1317 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1327 case Intrinsic::amdgcn_global_atomic_fadd:
1328 case Intrinsic::amdgcn_global_atomic_fmin:
1329 case Intrinsic::amdgcn_global_atomic_fmax:
1330 case Intrinsic::amdgcn_global_atomic_fmin_num:
1331 case Intrinsic::amdgcn_global_atomic_fmax_num:
1332 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1333 case Intrinsic::amdgcn_flat_atomic_fadd:
1334 case Intrinsic::amdgcn_flat_atomic_fmin:
1335 case Intrinsic::amdgcn_flat_atomic_fmax:
1336 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1337 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1338 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1339 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1340 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1351 case Intrinsic::amdgcn_global_load_tr: {
1359 case Intrinsic::amdgcn_ds_gws_init:
1360 case Intrinsic::amdgcn_ds_gws_barrier:
1361 case Intrinsic::amdgcn_ds_gws_sema_v:
1362 case Intrinsic::amdgcn_ds_gws_sema_br:
1363 case Intrinsic::amdgcn_ds_gws_sema_p:
1364 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1374 Info.memVT = MVT::i32;
1378 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1384 case Intrinsic::amdgcn_global_load_lds: {
1386 unsigned Width = cast<ConstantInt>(CI.
getArgOperand(2))->getZExtValue();
1392 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1402 Info.memVT = MVT::i32;
1416 Type *&AccessTy)
const {
1418 case Intrinsic::amdgcn_global_load_tr:
1419 case Intrinsic::amdgcn_ds_ordered_add:
1420 case Intrinsic::amdgcn_ds_ordered_swap:
1421 case Intrinsic::amdgcn_ds_append:
1422 case Intrinsic::amdgcn_ds_consume:
1423 case Intrinsic::amdgcn_ds_fadd:
1424 case Intrinsic::amdgcn_ds_fmin:
1425 case Intrinsic::amdgcn_ds_fmax:
1426 case Intrinsic::amdgcn_global_atomic_fadd:
1427 case Intrinsic::amdgcn_flat_atomic_fadd:
1428 case Intrinsic::amdgcn_flat_atomic_fmin:
1429 case Intrinsic::amdgcn_flat_atomic_fmax:
1430 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1431 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1432 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1433 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1434 case Intrinsic::amdgcn_global_atomic_csub: {
1445bool SITargetLowering::isLegalFlatAddressingMode(
const AddrMode &AM,
1451 return AM.BaseOffs == 0 && AM.Scale == 0;
1454 return AM.Scale == 0 &&
1456 AM.BaseOffs, AddrSpace, FlatVariant));
1478 return isLegalMUBUFAddressingMode(AM);
1481bool SITargetLowering::isLegalMUBUFAddressingMode(
const AddrMode &AM)
const {
1492 if (!
TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1504 if (AM.HasBaseReg) {
1535 return isLegalMUBUFAddressingMode(AM);
1542 if (Ty->
isSized() &&
DL.getTypeStoreSize(Ty) < 4)
1582 : isLegalMUBUFAddressingMode(AM);
1630 unsigned Size,
unsigned AddrSpace,
Align Alignment,
1644 Alignment < RequiredAlignment)
1665 RequiredAlignment =
Align(4);
1683 *IsFast = (Alignment >= RequiredAlignment) ? 64
1684 : (Alignment <
Align(4)) ? 32
1706 *IsFast = (Alignment >= RequiredAlignment) ? 96
1707 : (Alignment <
Align(4)) ? 32
1720 RequiredAlignment =
Align(8);
1731 *IsFast = (Alignment >= RequiredAlignment) ? 128
1732 : (Alignment <
Align(4)) ? 32
1749 *IsFast = (Alignment >= RequiredAlignment) ?
Size : 0;
1751 return Alignment >= RequiredAlignment ||
1756 bool AlignedBy4 = Alignment >=
Align(4);
1758 *IsFast = AlignedBy4;
1760 return AlignedBy4 ||
1770 bool AlignedBy4 = Alignment >=
Align(4);
1772 *IsFast = AlignedBy4;
1783 return Alignment >=
Align(4) ||
1797 return Size >= 32 && Alignment >=
Align(4);
1802 unsigned *IsFast)
const {
1804 Alignment, Flags, IsFast);
1814 if (
Op.size() >= 16 &&
1818 if (
Op.size() >= 8 &&
Op.isDstAligned(
Align(4)))
1826 const MemSDNode *MemNode = cast<MemSDNode>(
N);
1836 unsigned DestAS)
const {
1844 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1848 const MemSDNode *MemNode = cast<MemSDNode>(
N);
1868 unsigned Index)
const {
1915 std::tie(InputPtrReg, RC, ArgTy) =
1925 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1931 const SDLoc &SL)
const {
1938 const SDLoc &SL)
const {
1941 std::optional<uint32_t> KnownSize =
1943 if (KnownSize.has_value())
1970 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1979SDValue SITargetLowering::lowerKernargMemParameter(
1991 int64_t OffsetDiff =
Offset - AlignDownOffset;
1997 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2007 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal,
Signed, Arg);
2018 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load,
Signed, Arg);
2065 ExtType, SL, VA.
getLocVT(), Chain, FIN,
2093 Reg = &WorkGroupIDX;
2094 RC = &AMDGPU::SReg_32RegClass;
2098 Reg = &WorkGroupIDY;
2099 RC = &AMDGPU::SReg_32RegClass;
2103 Reg = &WorkGroupIDZ;
2104 RC = &AMDGPU::SReg_32RegClass;
2135 for (
unsigned I = 0,
E = Ins.size(), PSInputNum = 0;
I !=
E; ++
I) {
2139 "vector type argument should have been split");
2144 bool SkipArg = !Arg->
Used && !
Info->isPSInputAllocated(PSInputNum);
2153 "unexpected vector split in ps argument type");
2167 Info->markPSInputAllocated(PSInputNum);
2169 Info->markPSInputEnabled(PSInputNum);
2186 if (
Info.hasWorkItemIDX()) {
2188 MRI.setType(MF.
addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2192 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2196 if (
Info.hasWorkItemIDY()) {
2202 unsigned Reg = AMDGPU::VGPR1;
2203 MRI.setType(MF.
addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2210 if (
Info.hasWorkItemIDZ()) {
2216 unsigned Reg = AMDGPU::VGPR2;
2217 MRI.setType(MF.
addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2236 if (RegIdx == ArgVGPRs.
size()) {
2243 unsigned Reg = ArgVGPRs[RegIdx];
2245 assert(Reg != AMDGPU::NoRegister);
2255 unsigned NumArgRegs) {
2258 if (RegIdx == ArgSGPRs.
size())
2261 unsigned Reg = ArgSGPRs[RegIdx];
2263 assert(Reg != AMDGPU::NoRegister);
2277 assert(Reg != AMDGPU::NoRegister);
2303 const unsigned Mask = 0x3ff;
2306 if (
Info.hasWorkItemIDX()) {
2308 Info.setWorkItemIDX(Arg);
2311 if (
Info.hasWorkItemIDY()) {
2313 Info.setWorkItemIDY(Arg);
2316 if (
Info.hasWorkItemIDZ())
2328 const unsigned Mask = 0x3ff;
2353 if (
Info.hasImplicitArgPtr())
2361 if (
Info.hasWorkGroupIDX())
2364 if (
Info.hasWorkGroupIDY())
2367 if (
Info.hasWorkGroupIDZ())
2370 if (
Info.hasLDSKernelId())
2382 MF.
addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2389 MF.
addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2395 MF.
addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2403 MF.
addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2418 MF.
addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2424 MF.
addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2439 unsigned LastExplicitArgOffset =
2442 bool InPreloadSequence =
true;
2444 for (
auto &Arg :
F.args()) {
2445 if (!InPreloadSequence || !Arg.hasInRegAttr())
2448 int ArgIdx = Arg.getArgNo();
2451 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2452 (
int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2455 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2456 (
int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2458 assert(ArgLocs[ArgIdx].isMemLoc());
2459 auto &ArgLoc = ArgLocs[InIdx];
2461 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2463 unsigned NumAllocSGPRs =
2464 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2467 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2468 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2469 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2473 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2474 unsigned PaddingSGPRs =
alignTo(Padding, 4) / 4;
2476 if (PaddingSGPRs + NumAllocSGPRs + 1 >
2478 InPreloadSequence =
false;
2484 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2486 Info.addPreloadedKernArg(
TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2488 if (PreloadRegs->
size() > 1)
2489 RC = &AMDGPU::SGPR_32RegClass;
2490 for (
auto &Reg : *PreloadRegs) {
2496 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2505 if (
Info.hasLDSKernelId()) {
2507 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2517 bool IsShader)
const {
2525 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
2527 unsigned CurrentUserSGPRs =
Info.getNumUserSGPRs();
2531 unsigned NumRequiredSystemSGPRs =
Info.hasWorkGroupIDX() +
2532 Info.hasWorkGroupIDY() +
2533 Info.hasWorkGroupIDZ() +
2534 Info.hasWorkGroupInfo();
2535 for (
unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2537 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2542 if (!HasArchitectedSGPRs) {
2543 if (
Info.hasWorkGroupIDX()) {
2545 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2549 if (
Info.hasWorkGroupIDY()) {
2551 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2555 if (
Info.hasWorkGroupIDZ()) {
2557 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2562 if (
Info.hasWorkGroupInfo()) {
2564 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2568 if (
Info.hasPrivateSegmentWaveByteOffset()) {
2570 unsigned PrivateSegmentWaveByteOffsetReg;
2573 PrivateSegmentWaveByteOffsetReg =
2574 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2578 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2580 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2583 PrivateSegmentWaveByteOffsetReg =
Info.addPrivateSegmentWaveByteOffset();
2585 MF.
addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2586 CCInfo.
AllocateReg(PrivateSegmentWaveByteOffsetReg);
2590 Info.getNumPreloadedSGPRs() >= 16);
2605 if (HasStackObjects)
2606 Info.setHasNonSpillStackObjects(
true);
2611 HasStackObjects =
true;
2615 bool RequiresStackAccess = HasStackObjects || MFI.
hasCalls();
2617 if (!ST.enableFlatScratch()) {
2618 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.
getFunction())) {
2625 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2627 unsigned ReservedBufferReg =
TRI.reservedPrivateSegmentBufferReg(MF);
2637 Info.setScratchRSrcReg(ReservedBufferReg);
2656 if (!
MRI.isLiveIn(AMDGPU::SGPR32)) {
2657 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2664 for (
unsigned Reg : AMDGPU::SGPR_32RegClass) {
2665 if (!
MRI.isLiveIn(Reg)) {
2666 Info.setStackPtrOffsetReg(Reg);
2671 if (
Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2678 if (ST.getFrameLowering()->hasFP(MF)) {
2679 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2685 return !
Info->isEntryFunction();
2697 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2706 if (AMDGPU::SReg_64RegClass.
contains(*
I))
2707 RC = &AMDGPU::SGPR_64RegClass;
2708 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
2709 RC = &AMDGPU::SGPR_32RegClass;
2715 Entry->addLiveIn(*
I);
2720 for (
auto *Exit : Exits)
2722 TII->get(TargetOpcode::COPY), *
I)
2740 Fn,
"unsupported non-compute shaders with HSA",
DL.getDebugLoc());
2759 !
Info->hasLDSKernelId() && !
Info->hasWorkItemIDX() &&
2760 !
Info->hasWorkItemIDY() && !
Info->hasWorkItemIDZ());
2766 !
Info->hasWorkGroupIDZ());
2785 if ((
Info->getPSInputAddr() & 0x7F) == 0 ||
2786 ((
Info->getPSInputAddr() & 0xF) == 0 &&
Info->isPSInputAllocated(11))) {
2789 Info->markPSInputAllocated(0);
2790 Info->markPSInputEnabled(0);
2801 unsigned PsInputBits =
Info->getPSInputAddr() &
Info->getPSInputEnable();
2802 if ((PsInputBits & 0x7F) == 0 ||
2803 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2806 }
else if (IsKernel) {
2809 Splits.
append(Ins.begin(), Ins.end());
2823 }
else if (!IsGraphics) {
2848 for (
unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2858 if (IsEntryFunc && VA.
isMemLoc()) {
2881 if (Arg.
isOrigArg() &&
Info->getArgInfo().PreloadKernArgs.count(i)) {
2885 int64_t OffsetDiff =
Offset - AlignDownOffset;
2892 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2903 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
2904 Ins[i].Flags.isSExt(), &Ins[i]);
2912 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2915 if (PreloadRegs.
size() == 1) {
2916 Register VReg =
MRI.getLiveInVirtReg(PreloadRegs[0]);
2921 TRI->getRegSizeInBits(*RC)));
2929 for (
auto Reg : PreloadRegs) {
2936 PreloadRegs.size()),
2945 NewArg = convertArgType(DAG, VT, MemVT,
DL, CMemVT,
2946 Ins[i].Flags.isSExt(), &Ins[i]);
2951 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
2952 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2957 dyn_cast<PointerType>(FType->
getParamType(Ins[i].getOrigArgIndex()));
2970 }
else if (!IsEntryFunc && VA.
isMemLoc()) {
2971 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
2982 if (AMDGPU::VGPR_32RegClass.
contains(Reg))
2983 RC = &AMDGPU::VGPR_32RegClass;
2984 else if (AMDGPU::SGPR_32RegClass.
contains(Reg))
2985 RC = &AMDGPU::SGPR_32RegClass;
3038 auto &ArgUsageInfo =
3043 Info->setBytesInStackArgArea(StackArgSize);
3045 return Chains.
empty() ? Chain :
3069 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3070 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3071 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3094 bool IsWaveEnd =
Info->returnsVoid() && IsShader;
3112 for (
unsigned I = 0, RealRVLocIdx = 0,
E = RVLocs.
size();
I !=
E;
3113 ++
I, ++RealRVLocIdx) {
3117 SDValue Arg = OutVals[RealRVLocIdx];
3145 if (!
Info->isEntryFunction()) {
3151 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3153 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3169 return DAG.
getNode(Opc,
DL, MVT::Other, RetOps);
3186 for (
unsigned i = 0; i != RVLocs.
size(); ++i) {
3252 auto &ArgUsageInfo =
3254 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3283 std::tie(OutgoingArg, ArgRC, ArgTy) =
3291 std::tie(IncomingArg, IncomingArgRC, Ty) =
3293 assert(IncomingArgRC == ArgRC);
3296 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3304 InputReg = getImplicitArgPtr(DAG,
DL);
3306 std::optional<uint32_t> Id =
3308 if (Id.has_value()) {
3320 RegsToPass.emplace_back(OutgoingArg->
getRegister(), InputReg);
3324 unsigned SpecialArgOffset =
3338 std::tie(OutgoingArg, ArgRC, Ty) =
3341 std::tie(OutgoingArg, ArgRC, Ty) =
3344 std::tie(OutgoingArg, ArgRC, Ty) =
3359 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3360 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3361 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3378 InputReg = InputReg.
getNode() ?
3387 InputReg = InputReg.
getNode() ?
3391 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3392 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3402 IncomingArgX ? *IncomingArgX :
3403 IncomingArgY ? *IncomingArgY :
3404 *IncomingArgZ, ~0u);
3411 RegsToPass.emplace_back(OutgoingArg->
getRegister(), InputReg);
3452 if (Callee->isDivergent())
3459 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
3463 if (!CallerPreserved)
3466 bool CCMatch = CallerCC == CalleeCC;
3479 if (Arg.hasByValAttr())
3493 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
3494 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3503 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3537 if (IsChainCallConv) {
3541 RequestedExec = CLI.
Args.back();
3542 assert(RequestedExec.
Node &&
"No node for EXEC");
3547 assert(CLI.
Outs.back().OrigArgIndex == 2 &&
"Unexpected last arg");
3548 CLI.
Outs.pop_back();
3552 assert(CLI.
Outs.back().OrigArgIndex == 2 &&
"Exec wasn't split up");
3553 CLI.
Outs.pop_back();
3558 "Haven't popped all the pieces of the EXEC mask");
3569 bool IsSibCall =
false;
3570 bool IsThisReturn =
false;
3575 for (
unsigned I = 0,
E = CLI.
Ins.size();
I !=
E; ++
I)
3584 "unsupported call to variadic function ");
3592 "unsupported required tail call to function ");
3597 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3601 "site marked musttail or on llvm.amdgcn.cs.chain");
3608 if (!TailCallOpt && IsTailCall)
3653 if (!IsSibCall || IsChainCallConv) {
3660 RegsToPass.emplace_back(IsChainCallConv
3661 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3662 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3669 MVT PtrVT = MVT::i32;
3672 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
3700 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
3708 int32_t
Offset = LocMemOffset;
3715 unsigned OpSize = Flags.isByVal() ?
3721 ? Flags.getNonZeroByValAlign()
3748 if (Outs[i].Flags.isByVal()) {
3750 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
3753 Outs[i].Flags.getNonZeroByValAlign(),
3761 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
3767 if (!MemOpChains.
empty())
3773 for (
auto &RegToPass : RegsToPass) {
3775 RegToPass.second, InGlue);
3784 if (IsTailCall && !IsSibCall) {
3789 std::vector<SDValue> Ops;
3790 Ops.push_back(Chain);
3791 Ops.push_back(Callee);
3808 if (IsChainCallConv)
3809 Ops.push_back(RequestedExec.
Node);
3813 for (
auto &RegToPass : RegsToPass) {
3815 RegToPass.second.getValueType()));
3820 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
3821 assert(Mask &&
"Missing call preserved mask for calling convention");
3825 Ops.push_back(InGlue);
3844 return DAG.
getNode(OPC,
DL, NodeTys, Ops);
3849 Chain = Call.getValue(0);
3850 InGlue = Call.getValue(1);
3852 uint64_t CalleePopBytes = NumBytes;
3860 InVals, IsThisReturn,
3861 IsThisReturn ? OutVals[0] :
SDValue());
3872 EVT VT =
Op.getValueType();
3887 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3898 Tmp1 = DAG.
getNode(Opc, dl, VT, SP, ScaledSize);
3899 if (Alignment && *Alignment > StackAlign) {
3920 if (isa<ConstantSDNode>(
Size))
3927 if (
Op.getValueType() != MVT::i32)
3946 assert(
Op.getValueType() == MVT::i32);
3955 Op.getOperand(0), IntrinID, GetRoundBothImm);
3989 SDValue RoundModeTimesNumBits =
4009 TableEntry, EnumOffset);
4015 if (
Op->isDivergent())
4018 switch (cast<MemSDNode>(
Op)->getAddressSpace()) {
4034 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4035 EVT SrcVT = Src.getValueType();
4044 EVT DstVT =
Op.getValueType();
4054 .
Case(
"m0", AMDGPU::M0)
4055 .
Case(
"exec", AMDGPU::EXEC)
4056 .
Case(
"exec_lo", AMDGPU::EXEC_LO)
4057 .
Case(
"exec_hi", AMDGPU::EXEC_HI)
4058 .
Case(
"flat_scratch", AMDGPU::FLAT_SCR)
4059 .
Case(
"flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4060 .
Case(
"flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4063 if (Reg == AMDGPU::NoRegister) {
4077 case AMDGPU::EXEC_LO:
4078 case AMDGPU::EXEC_HI:
4079 case AMDGPU::FLAT_SCR_LO:
4080 case AMDGPU::FLAT_SCR_HI:
4085 case AMDGPU::FLAT_SCR:
4104 MI.setDesc(
TII->getKillTerminatorFromPseudo(
MI.getOpcode()));
4113static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4135 auto Next = std::next(
I);
4148 return std::pair(LoopBB, RemainderBB);
4155 auto I =
MI.getIterator();
4156 auto E = std::next(
I);
4178 Src->setIsKill(
false);
4194 Register Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4197 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
4219 unsigned InitReg,
unsigned ResultReg,
unsigned PhiReg,
4220 unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode,
4229 Register PhiExec =
MRI.createVirtualRegister(BoolRC);
4230 Register NewExec =
MRI.createVirtualRegister(BoolRC);
4231 Register CurrentIdxReg =
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4232 Register CondReg =
MRI.createVirtualRegister(BoolRC);
4240 BuildMI(LoopBB,
I,
DL,
TII->get(TargetOpcode::PHI), PhiExec)
4247 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4251 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4256 BuildMI(LoopBB,
I,
DL,
TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4257 : AMDGPU::S_AND_SAVEEXEC_B64),
4261 MRI.setSimpleHint(NewExec, CondReg);
4263 if (UseGPRIdxMode) {
4265 SGPRIdxReg = CurrentIdxReg;
4267 SGPRIdxReg =
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4268 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4275 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4278 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4285 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4287 BuildMI(LoopBB,
I,
DL,
TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4288 : AMDGPU::S_XOR_B64_term), Exec)
4309 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
4310 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
4318 const auto *BoolXExecRC =
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4320 Register SaveExec =
MRI.createVirtualRegister(BoolXExecRC);
4321 Register TmpExec =
MRI.createVirtualRegister(BoolXExecRC);
4322 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4323 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4338 InitResultReg, DstReg, PhiReg, TmpExec,
4339 Offset, UseGPRIdxMode, SGPRIdxReg);
4356static std::pair<unsigned, int>
4361 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
4366 return std::pair(AMDGPU::sub0,
Offset);
4380 assert(
Idx->getReg() != AMDGPU::NoRegister);
4401 return Idx->getReg();
4403 Register Tmp =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4420 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
4421 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
4430 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4433 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4437 if (UseGPRIdxMode) {
4444 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
4457 MI.eraseFromParent();
4466 Register PhiReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4467 Register InitReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4473 UseGPRIdxMode, SGPRIdxReg);
4477 if (UseGPRIdxMode) {
4479 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
4481 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
4486 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4491 MI.eraseFromParent();
4508 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
4519 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4521 if (
Idx->getReg() == AMDGPU::NoRegister) {
4532 MI.eraseFromParent();
4537 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4541 if (UseGPRIdxMode) {
4545 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
4554 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
4555 TRI.getRegSizeInBits(*VecRC), 32,
false);
4561 MI.eraseFromParent();
4571 Register PhiReg =
MRI.createVirtualRegister(VecRC);
4575 UseGPRIdxMode, SGPRIdxReg);
4578 if (UseGPRIdxMode) {
4580 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
4582 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
4588 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
4589 TRI.getRegSizeInBits(*VecRC), 32,
false);
4590 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
4596 MI.eraseFromParent();
4611 bool isSGPR =
TRI->isSGPRClass(
MRI.getRegClass(SrcReg));
4639 Register LoopIterator =
MRI.createVirtualRegister(WaveMaskRegClass);
4640 Register InitalValReg =
MRI.createVirtualRegister(DstRegClass);
4642 Register AccumulatorReg =
MRI.createVirtualRegister(DstRegClass);
4643 Register ActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
4644 Register NewActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
4646 Register FF1Reg =
MRI.createVirtualRegister(DstRegClass);
4647 Register LaneValueReg =
MRI.createVirtualRegister(DstRegClass);
4649 bool IsWave32 = ST.isWave32();
4650 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4651 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4656 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4659 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4664 I = ComputeLoop->end();
4666 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), AccumulatorReg)
4670 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), ActiveBitsReg)
4671 .
addReg(TmpSReg->getOperand(0).getReg())
4675 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4676 auto FF1 =
BuildMI(*ComputeLoop,
I,
DL,
TII->get(SFFOpc), FF1Reg)
4677 .
addReg(ActiveBits->getOperand(0).getReg());
4678 auto LaneValue =
BuildMI(*ComputeLoop,
I,
DL,
4679 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4681 .
addReg(FF1->getOperand(0).getReg());
4682 auto NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
TII->get(Opc), DstReg)
4684 .
addReg(LaneValue->getOperand(0).getReg());
4687 unsigned BITSETOpc =
4688 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4689 auto NewActiveBits =
4690 BuildMI(*ComputeLoop,
I,
DL,
TII->get(BITSETOpc), NewActiveBitsReg)
4691 .
addReg(FF1->getOperand(0).getReg())
4692 .
addReg(ActiveBits->getOperand(0).getReg());
4695 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4696 .addMBB(ComputeLoop);
4697 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4698 .addMBB(ComputeLoop);
4701 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4703 .
addReg(NewActiveBits->getOperand(0).getReg())
4705 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::S_CBRANCH_SCC1))
4710 MI.eraseFromParent();
4721 switch (
MI.getOpcode()) {
4722 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4724 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4726 case AMDGPU::S_UADDO_PSEUDO:
4727 case AMDGPU::S_USUBO_PSEUDO: {
4734 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4736 : AMDGPU::S_SUB_I32;
4743 MI.eraseFromParent();
4746 case AMDGPU::S_ADD_U64_PSEUDO:
4747 case AMDGPU::S_SUB_U64_PSEUDO: {
4756 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4758 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4766 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4767 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4770 MI,
MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4772 MI,
MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4775 MI,
MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4777 MI,
MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4779 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4780 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4793 MI.eraseFromParent();
4796 case AMDGPU::V_ADD_U64_PSEUDO:
4797 case AMDGPU::V_SUB_U64_PSEUDO: {
4803 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
4809 if (IsAdd && ST.hasLshlAddB64()) {
4815 TII->legalizeOperands(*
Add);
4816 MI.eraseFromParent();
4820 const auto *CarryRC =
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4822 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4823 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4825 Register CarryReg =
MRI.createVirtualRegister(CarryRC);
4826 Register DeadCarryReg =
MRI.createVirtualRegister(CarryRC);
4830 : &AMDGPU::VReg_64RegClass;
4833 : &AMDGPU::VReg_64RegClass;
4836 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
4838 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
4841 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4843 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4846 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4848 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4850 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4857 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4871 TII->legalizeOperands(*LoHalf);
4872 TII->legalizeOperands(*HiHalf);
4873 MI.eraseFromParent();
4876 case AMDGPU::S_ADD_CO_PSEUDO:
4877 case AMDGPU::S_SUB_CO_PSEUDO: {
4891 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
4892 ? AMDGPU::S_ADDC_U32
4893 : AMDGPU::S_SUBB_U32;
4895 Register RegOp0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4896 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
4901 Register RegOp1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4902 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
4906 Register RegOp2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4908 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
4914 unsigned WaveSize =
TRI->getRegSizeInBits(*Src2RC);
4915 assert(WaveSize == 64 || WaveSize == 32);
4917 if (WaveSize == 64) {
4918 if (ST.hasScalarCompareEq64()) {
4924 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
4926 MII,
MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
4928 MII,
MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
4929 Register Src2_32 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4931 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::S_OR_B32), Src2_32)
4948 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
4954 MI.eraseFromParent();
4957 case AMDGPU::SI_INIT_M0: {
4959 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4960 .
add(
MI.getOperand(0));
4961 MI.eraseFromParent();
4964 case AMDGPU::GET_GROUPSTATICSIZE: {
4969 .
add(
MI.getOperand(0))
4971 MI.eraseFromParent();
4974 case AMDGPU::GET_SHADERCYCLESHILO: {
4988 Register RegHi1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4992 Register RegLo1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4996 Register RegHi2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5003 Register RegLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5008 .
add(
MI.getOperand(0))
5013 MI.eraseFromParent();
5016 case AMDGPU::SI_INDIRECT_SRC_V1:
5017 case AMDGPU::SI_INDIRECT_SRC_V2:
5018 case AMDGPU::SI_INDIRECT_SRC_V4:
5019 case AMDGPU::SI_INDIRECT_SRC_V8:
5020 case AMDGPU::SI_INDIRECT_SRC_V9:
5021 case AMDGPU::SI_INDIRECT_SRC_V10:
5022 case AMDGPU::SI_INDIRECT_SRC_V11:
5023 case AMDGPU::SI_INDIRECT_SRC_V12:
5024 case AMDGPU::SI_INDIRECT_SRC_V16:
5025 case AMDGPU::SI_INDIRECT_SRC_V32:
5027 case AMDGPU::SI_INDIRECT_DST_V1:
5028 case AMDGPU::SI_INDIRECT_DST_V2:
5029 case AMDGPU::SI_INDIRECT_DST_V4:
5030 case AMDGPU::SI_INDIRECT_DST_V8:
5031 case AMDGPU::SI_INDIRECT_DST_V9:
5032 case AMDGPU::SI_INDIRECT_DST_V10:
5033 case AMDGPU::SI_INDIRECT_DST_V11:
5034 case AMDGPU::SI_INDIRECT_DST_V12:
5035 case AMDGPU::SI_INDIRECT_DST_V16:
5036 case AMDGPU::SI_INDIRECT_DST_V32:
5038 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5039 case AMDGPU::SI_KILL_I1_PSEUDO:
5041 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5050 Register SrcCond =
MI.getOperand(3).getReg();
5052 Register DstLo =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5053 Register DstHi =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5054 const auto *CondRC =
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5055 Register SrcCondCopy =
MRI.createVirtualRegister(CondRC);
5059 : &AMDGPU::VReg_64RegClass;
5062 : &AMDGPU::VReg_64RegClass;
5065 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5067 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5070 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5072 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5075 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5077 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5099 MI.eraseFromParent();
5102 case AMDGPU::SI_BR_UNDEF: {
5106 .
add(
MI.getOperand(0));
5108 MI.eraseFromParent();
5111 case AMDGPU::ADJCALLSTACKUP:
5112 case AMDGPU::ADJCALLSTACKDOWN: {
5119 case AMDGPU::SI_CALL_ISEL: {
5123 unsigned ReturnAddrReg =
TII->getRegisterInfo().getReturnAddressReg(*MF);
5126 MIB =
BuildMI(*BB,
MI,
DL,
TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5132 MI.eraseFromParent();
5135 case AMDGPU::V_ADD_CO_U32_e32:
5136 case AMDGPU::V_SUB_CO_U32_e32:
5137 case AMDGPU::V_SUBREV_CO_U32_e32: {
5140 unsigned Opc =
MI.getOpcode();
5142 bool NeedClampOperand =
false;
5143 if (
TII->pseudoToMCOpcode(Opc) == -1) {
5145 NeedClampOperand =
true;
5149 if (
TII->isVOP3(*
I)) {
5154 I.add(
MI.getOperand(1))
5155 .add(
MI.getOperand(2));
5156 if (NeedClampOperand)
5159 TII->legalizeOperands(*
I);
5161 MI.eraseFromParent();
5164 case AMDGPU::V_ADDC_U32_e32:
5165 case AMDGPU::V_SUBB_U32_e32:
5166 case AMDGPU::V_SUBBREV_U32_e32:
5169 TII->legalizeOperands(
MI);
5171 case AMDGPU::DS_GWS_INIT:
5172 case AMDGPU::DS_GWS_SEMA_BR:
5173 case AMDGPU::DS_GWS_BARRIER:
5174 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::data0);
5176 case AMDGPU::DS_GWS_SEMA_V:
5177 case AMDGPU::DS_GWS_SEMA_P:
5178 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5186 case AMDGPU::S_SETREG_B32: {
5201 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5202 const unsigned SetMask = WidthMask <<
Offset;
5205 unsigned SetDenormOp = 0;
5206 unsigned SetRoundOp = 0;
5214 SetRoundOp = AMDGPU::S_ROUND_MODE;
5215 SetDenormOp = AMDGPU::S_DENORM_MODE;
5217 SetRoundOp = AMDGPU::S_ROUND_MODE;
5219 SetDenormOp = AMDGPU::S_DENORM_MODE;
5222 if (SetRoundOp || SetDenormOp) {
5225 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5226 unsigned ImmVal = Def->getOperand(1).getImm();
5240 MI.eraseFromParent();
5249 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
5253 case AMDGPU::S_INVERSE_BALLOT_U32:
5254 case AMDGPU::S_INVERSE_BALLOT_U64: {
5259 const Register DstReg =
MI.getOperand(0).getReg();
5260 Register MaskReg =
MI.getOperand(1).getReg();
5262 const bool IsVALU =
TRI->isVectorRegister(
MRI, MaskReg);
5265 MaskReg =
TII->readlaneVGPRToSGPR(MaskReg,
MI,
MRI);
5269 MI.eraseFromParent();
5272 case AMDGPU::ENDPGM_TRAP: {
5275 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
5293 MI.eraseFromParent();
5302 switch (
Op.getValue(0).getSimpleValueType().SimpleTy) {
5337 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5424 EVT VT =
N->getValueType(0);
5428 if (VT == MVT::f16) {
5444 unsigned Opc =
Op.getOpcode();
5445 EVT VT =
Op.getValueType();
5446 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5447 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5448 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5449 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5467 unsigned Opc =
Op.getOpcode();
5468 EVT VT =
Op.getValueType();
5469 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5470 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5471 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5472 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5491 unsigned Opc =
Op.getOpcode();
5492 EVT VT =
Op.getValueType();
5493 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5494 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5495 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5496 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5497 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5498 VT == MVT::v32bf16);
5504 : std::pair(Op0, Op0);
5523 switch (
Op.getOpcode()) {
5529 assert((!Result.getNode() ||
5530 Result.getNode()->getNumValues() == 2) &&
5531 "Load should return a value and a chain");
5535 EVT VT =
Op.getValueType();
5537 return lowerFSQRTF32(
Op, DAG);
5539 return lowerFSQRTF64(
Op, DAG);
5544 return LowerTrig(
Op, DAG);
5553 return LowerGlobalAddress(MFI,
Op, DAG);
5560 return lowerINSERT_SUBVECTOR(
Op, DAG);
5562 return lowerINSERT_VECTOR_ELT(
Op, DAG);
5564 return lowerEXTRACT_VECTOR_ELT(
Op, DAG);
5566 return lowerVECTOR_SHUFFLE(
Op, DAG);
5568 return lowerSCALAR_TO_VECTOR(
Op, DAG);
5570 return lowerBUILD_VECTOR(
Op, DAG);
5573 return lowerFP_ROUND(
Op, DAG);
5578 if (
Op.getOperand(0)->getValueType(0) != MVT::f32)
5582 int RoundMode =
Op.getConstantOperandVal(1);
5590 return DAG.
getNode(Opc,
DL,
Op.getNode()->getVTList(),
Op->getOperand(0));
5593 return lowerTRAP(
Op, DAG);
5595 return lowerDEBUGTRAP(
Op, DAG);
5603 return lowerFMINNUM_FMAXNUM(
Op, DAG);
5606 return lowerFLDEXP(
Op, DAG);
5631 return lowerMUL(
Op, DAG);
5634 return lowerXMULO(
Op, DAG);
5637 return lowerXMUL_LOHI(
Op, DAG);
5664 EVT FittingLoadVT = LoadVT;
5696SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
5700 bool IsIntrinsic)
const {
5704 EVT LoadVT =
M->getValueType(0);
5706 EVT EquivLoadVT = LoadVT;
5725 VTList, Ops,
M->getMemoryVT(),
5726 M->getMemOperand());
5737 EVT LoadVT =
M->getValueType(0);
5743 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
5744 bool IsTFE =
M->getNumValues() == 3;
5763 return handleByteShortBufferLoads(DAG, LoadVT,
DL, Ops,
M->getMemOperand());
5766 return getMemIntrinsicNode(Opc,
DL,
M->getVTList(), Ops, IntVT,
5767 M->getMemOperand(), DAG);
5772 SDValue MemNode = getMemIntrinsicNode(Opc,
DL, VTList, Ops, CastVT,
5773 M->getMemOperand(), DAG);
5781 EVT VT =
N->getValueType(0);
5782 unsigned CondCode =
N->getConstantOperandVal(3);
5793 EVT CmpVT =
LHS.getValueType();
5794 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
5815 EVT VT =
N->getValueType(0);
5817 unsigned CondCode =
N->getConstantOperandVal(3);
5826 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
5844 EVT VT =
N->getValueType(0);
5851 Src.getOperand(1), Src.getOperand(2));
5862 Exec = AMDGPU::EXEC_LO;
5864 Exec = AMDGPU::EXEC;
5882 switch (
N->getOpcode()) {
5894 unsigned IID =
N->getConstantOperandVal(0);
5896 case Intrinsic::amdgcn_make_buffer_rsrc:
5897 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
5899 case Intrinsic::amdgcn_cvt_pkrtz: {
5908 case Intrinsic::amdgcn_cvt_pknorm_i16:
5909 case Intrinsic::amdgcn_cvt_pknorm_u16:
5910 case Intrinsic::amdgcn_cvt_pk_i16:
5911 case Intrinsic::amdgcn_cvt_pk_u16: {
5917 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
5919 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
5921 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
5926 EVT VT =
N->getValueType(0);
5935 case Intrinsic::amdgcn_s_buffer_load: {
5947 EVT VT =
Op.getValueType();
5948 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
5960 if (!
Offset->isDivergent()) {
5979 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL, Ops, MMO);
5991 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
5992 Results.push_back(Res.getOperand(
I));
5996 Results.push_back(Res.getValue(1));
6005 EVT VT =
N->getValueType(0);
6010 EVT SelectVT = NewVT;
6011 if (NewVT.
bitsLT(MVT::i32)) {
6014 SelectVT = MVT::i32;
6020 if (NewVT != SelectVT)
6026 if (
N->getValueType(0) != MVT::v2f16)
6039 if (
N->getValueType(0) != MVT::v2f16)
6052 if (
N->getValueType(0) != MVT::f16)
6070 if (
I.getUse().get() !=
Value)
6073 if (
I->getOpcode() == Opcode)
6079unsigned SITargetLowering::isCFIntrinsic(
const SDNode *
Intr)
const {
6081 switch (
Intr->getConstantOperandVal(1)) {
6082 case Intrinsic::amdgcn_if:
6084 case Intrinsic::amdgcn_else:
6086 case Intrinsic::amdgcn_loop:
6088 case Intrinsic::amdgcn_end_cf:
6137 SDNode *
Intr = BRCOND.getOperand(1).getNode();
6150 assert(BR &&
"brcond missing unconditional branch user");
6151 Target = BR->getOperand(1);
6154 unsigned CFNode = isCFIntrinsic(
Intr);
6173 Ops.
append(
Intr->op_begin() + (HaveChain ? 2 : 1),
Intr->op_end());
6203 for (
unsigned i = 1, e =
Intr->getNumValues() - 1; i != e; ++i) {
6220 Intr->getOperand(0));
6227 MVT VT =
Op.getSimpleValueType();
6230 if (
Op.getConstantOperandVal(0) != 0)
6236 if (
Info->isEntryFunction())
6254 return Op.getValueType().bitsLE(VT) ?
6261 assert(
Op.getValueType() == MVT::f16 &&
6262 "Do not know how to custom lower FP_ROUND for non-f16 type");
6265 EVT SrcVT = Src.getValueType();
6266 if (SrcVT != MVT::f64)
6282 EVT VT =
Op.getValueType();
6285 bool IsIEEEMode =
Info->getMode().IEEE;
6294 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6302 EVT VT =
Op.getValueType();
6306 EVT ExpVT =
Exp.getValueType();
6307 if (ExpVT == MVT::i16)
6328 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
6336 EVT VT =
Op.getValueType();
6342 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
6369 if (
Op->isDivergent())
6382 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6384 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6387 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6389 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6395 EVT VT =
Op.getValueType();
6402 const APInt &
C = RHSC->getAPIntValue();
6404 if (
C.isPowerOf2()) {
6406 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
6411 SL, VT, Result, ShiftAmt),
6431 if (
Op->isDivergent()) {
6448 return lowerTrapEndpgm(
Op, DAG);
6451 lowerTrapHsaQueuePtr(
Op, DAG);
6454SDValue SITargetLowering::lowerTrapEndpgm(
6462 const SDLoc &
DL,
Align Alignment, ImplicitParameter Param)
const {
6472SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6482 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
6488 if (UserSGPR == AMDGPU::NoRegister) {
6513SDValue SITargetLowering::lowerTrapHsa(
6534 "debugtrap handler not supported",
6550SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
6554 ? AMDGPU::SRC_SHARED_BASE
6555 : AMDGPU::SRC_PRIVATE_BASE;
6578 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6587 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
6593 if (UserSGPR == AMDGPU::NoRegister) {
6600 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
6623 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
6624 isa<BasicBlockSDNode>(Val))
6627 if (
auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
6628 return ConstVal->getSExtValue() !=
TM.getNullPointerValue(AddrSpace);
6658 unsigned NullVal =
TM.getNullPointerValue(DestAS);
6680 unsigned NullVal =
TM.getNullPointerValue(SrcAS);
6692 Op.getValueType() == MVT::i64) {
6701 Src.getValueType() == MVT::i64)
6725 EVT InsVT =
Ins.getValueType();
6728 unsigned IdxVal =
Idx->getAsZExtVal();
6733 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
6738 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
6740 MVT::i32, InsNumElts / 2);
6745 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
6747 if (InsNumElts == 2) {
6760 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
6782 auto KIdx = dyn_cast<ConstantSDNode>(
Idx);
6783 if (NumElts == 4 && EltSize == 16 && KIdx) {
6794 unsigned Idx = KIdx->getZExtValue();
6795 bool InsertLo =
Idx < 2;
6797 InsertLo ? LoVec : HiVec,
6812 if (isa<ConstantSDNode>(
Idx))
6818 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
6824 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
6840 DAG.
getNOT(SL, BFM, IntVT), BCVec);
6852 EVT ResultVT =
Op.getValueType();
6865 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
6868 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
6873 if (VecSize == 128) {
6881 }
else if (VecSize == 256) {
6884 for (
unsigned P = 0;
P < 4; ++
P) {
6890 Parts[0], Parts[1]));
6892 Parts[2], Parts[3]));
6898 for (
unsigned P = 0;
P < 8; ++
P) {
6905 Parts[0], Parts[1], Parts[2], Parts[3]));
6908 Parts[4], Parts[5],Parts[6], Parts[7]));
6911 EVT IdxVT =
Idx.getValueType();
6928 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
6943 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
6953 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
6959 EVT ResultVT =
Op.getValueType();
6962 EVT PackVT = ResultVT.
isInteger() ? MVT::v2i16 : MVT::v2f16;
6964 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
6980 int VecIdx =
Idx < SrcNumElts ? 0 : 1;
6981 int EltIdx =
Idx < SrcNumElts ?
Idx :
Idx - SrcNumElts;
6989 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
6990 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
6991 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
6992 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7011 EVT ResultVT =
Op.getValueType();
7027 EVT VT =
Op.getValueType();
7029 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7030 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7049 { CastLo, CastHi });
7053 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7060 for (
unsigned P = 0;
P < 4; ++
P)
7061 Parts[
P].push_back(
Op.getOperand(
I +
P *
E));
7064 for (
unsigned P = 0;
P < 4; ++
P) {
7074 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7081 for (
unsigned P = 0;
P < 8; ++
P)
7082 Parts[
P].push_back(
Op.getOperand(
I +
P *
E));
7085 for (
unsigned P = 0;
P < 8; ++
P) {
7095 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7136 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
7174 EVT PtrVT =
Op.getValueType();
7190 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
7263 SDValue Param = lowerKernargMemParameter(
7273 "non-hsa intrinsic with hsa target",
7282 "intrinsic not supported on subtarget",
7292 unsigned NumElts = Elts.
size();
7294 if (NumElts <= 12) {
7303 for (
unsigned i = 0; i < Elts.
size(); ++i) {
7309 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
7310 VecElts[i] = DAG.
getUNDEF(MVT::f32);
7319 EVT SrcVT = Src.getValueType();
7340 bool Unpacked,
bool IsD16,
int DMaskPop,
7341 int NumVDataDwords,
bool IsAtomicPacked16Bit,
7344 EVT ReqRetVT = ResultTypes[0];
7346 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7347 ? (ReqRetNumElts + 1) / 2
7350 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
7351 DMaskPop : (DMaskPop + 1) / 2;
7353 MVT DataDwordVT = NumDataDwords == 1 ?
7356 MVT MaskPopVT = MaskPopDwords == 1 ?
7362 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
7373 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
7375 NumDataDwords - MaskPopDwords);
7380 EVT LegalReqRetVT = ReqRetVT;
7382 if (!
Data.getValueType().isInteger())
7384 Data.getValueType().changeTypeToInteger(),
Data);
7405 if (Result->getNumValues() == 1)
7412 SDValue *LWE,
bool &IsTexFail) {
7413 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.
getNode());
7432 unsigned DimIdx,
unsigned EndIdx,
7433 unsigned NumGradients) {
7435 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
7443 if (((
I + 1) >= EndIdx) ||
7444 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
7445 I == DimIdx + NumGradients - 1))) {
7446 if (
Addr.getValueType() != MVT::i16)
7467 unsigned IntrOpcode =
Intr->BaseOpcode;
7479 bool AdjustRetType =
false;
7480 bool IsAtomicPacked16Bit =
false;
7483 const unsigned ArgOffset = WithChain ? 2 : 1;
7486 unsigned DMaskLanes = 0;
7488 if (BaseOpcode->Atomic) {
7489 VData =
Op.getOperand(2);
7491 IsAtomicPacked16Bit =
7492 (
Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7493 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7496 if (BaseOpcode->AtomicX2) {
7503 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7504 DMask = Is64Bit ? 0xf : 0x3;
7505 NumVDataDwords = Is64Bit ? 4 : 2;
7507 DMask = Is64Bit ? 0x3 : 0x1;
7508 NumVDataDwords = Is64Bit ? 2 : 1;
7511 DMask =
Op->getConstantOperandVal(ArgOffset +
Intr->DMaskIndex);
7514 if (BaseOpcode->Store) {
7515 VData =
Op.getOperand(2);
7523 VData = handleD16VData(VData, DAG,
true);
7540 (!LoadVT.
isVector() && DMaskLanes > 1))
7548 NumVDataDwords = (DMaskLanes + 1) / 2;
7550 NumVDataDwords = DMaskLanes;
7552 AdjustRetType =
true;
7556 unsigned VAddrEnd = ArgOffset +
Intr->VAddrEnd;
7561 Op.getOperand(ArgOffset +
Intr->GradientStart).getSimpleValueType();
7563 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7564 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7566 VAddrVT =
Op.getOperand(ArgOffset +
Intr->CoordStart).getSimpleValueType();
7568 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7569 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7572 for (
unsigned I =
Intr->VAddrStart; I < Intr->GradientStart;
I++) {
7573 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
7574 assert(
I ==
Intr->BiasIndex &&
"Got unexpected 16-bit extra argument");
7579 {
Op.getOperand(ArgOffset +
I), DAG.
getUNDEF(MVT::f16)});
7583 "Bias needs to be converted to 16 bit in A16 mode");
7588 if (BaseOpcode->Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
7592 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
7593 "require 16 bit args for both gradients and addresses");
7598 if (!
ST->hasA16()) {
7599 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
7600 "support 16 bit addresses\n");
7610 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
7614 IntrOpcode = G16MappingInfo->
G16;
7622 ArgOffset +
Intr->GradientStart,
7623 ArgOffset +
Intr->CoordStart,
Intr->NumGradients);
7625 for (
unsigned I = ArgOffset +
Intr->GradientStart;
7626 I < ArgOffset + Intr->CoordStart;
I++)
7633 ArgOffset +
Intr->CoordStart, VAddrEnd,
7637 for (
unsigned I = ArgOffset +
Intr->CoordStart;
I < VAddrEnd;
I++)
7655 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->Sampler);
7656 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
7657 const bool UseNSA =
ST->hasNSAEncoding() &&
7658 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
7659 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
7660 const bool UsePartialNSA =
7661 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
7664 if (UsePartialNSA) {
7666 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
7675 if (!BaseOpcode->Sampler) {
7679 cast<ConstantSDNode>(
Op.getOperand(ArgOffset +
Intr->UnormIndex));
7681 Unorm = UnormConst->getZExtValue() ? True : False;
7686 SDValue TexFail =
Op.getOperand(ArgOffset +
Intr->TexFailCtrlIndex);
7687 bool IsTexFail =
false;
7688 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
7699 NumVDataDwords += 1;
7700 AdjustRetType =
true;
7705 if (AdjustRetType) {
7707 if (DMaskLanes == 0 && !BaseOpcode->Store) {
7710 if (isa<MemSDNode>(
Op))
7715 EVT NewVT = NumVDataDwords > 1 ?
7719 ResultTypes[0] = NewVT;
7720 if (ResultTypes.size() == 3) {
7724 ResultTypes.erase(&ResultTypes[1]);
7728 unsigned CPol = cast<ConstantSDNode>(
7729 Op.getOperand(ArgOffset +
Intr->CachePolicyIndex))->getZExtValue();
7730 if (BaseOpcode->Atomic)
7737 if (BaseOpcode->Store || BaseOpcode->Atomic)
7739 if (UsePartialNSA) {
7748 if (BaseOpcode->Sampler)
7753 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7757 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
7765 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7769 if (BaseOpcode->HasD16)
7771 if (isa<MemSDNode>(
Op))
7774 int NumVAddrDwords =
7780 NumVDataDwords, NumVAddrDwords);
7781 }
else if (IsGFX11Plus) {
7783 UseNSA ? AMDGPU::MIMGEncGfx11NSA
7784 : AMDGPU::MIMGEncGfx11Default,
7785 NumVDataDwords, NumVAddrDwords);
7786 }
else if (IsGFX10Plus) {
7788 UseNSA ? AMDGPU::MIMGEncGfx10NSA
7789 : AMDGPU::MIMGEncGfx10Default,
7790 NumVDataDwords, NumVAddrDwords);
7794 NumVDataDwords, NumVAddrDwords);
7797 "requested image instruction is not supported on this GPU");
7802 NumVDataDwords, NumVAddrDwords);
7805 NumVDataDwords, NumVAddrDwords);
7811 if (
auto MemOp = dyn_cast<MemSDNode>(
Op)) {
7816 if (BaseOpcode->AtomicX2) {
7821 if (BaseOpcode->Store)
7825 NumVDataDwords, IsAtomicPacked16Bit,
DL);
7843 if (!
Offset->isDivergent()) {
7888 return handleByteShortBufferLoads(DAG, VT,
DL, Ops, MMO);
7892 unsigned NumLoads = 1;
7898 if (NumElts == 8 || NumElts == 16) {
7899 NumLoads = NumElts / 4;
7907 setBufferOffsets(
Offset, DAG, &Ops[3],
7908 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
7911 for (
unsigned i = 0; i < NumLoads; ++i) {
7917 if (NumElts == 8 || NumElts == 16)
7964 EVT VT =
Op.getValueType();
7966 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
7970 switch (IntrinsicID) {
7971 case Intrinsic::amdgcn_implicit_buffer_ptr: {
7974 return getPreloadedValue(DAG, *MFI, VT,
7977 case Intrinsic::amdgcn_dispatch_ptr:
7978 case Intrinsic::amdgcn_queue_ptr: {
7981 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
7987 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
7989 return getPreloadedValue(DAG, *MFI, VT, RegID);
7991 case Intrinsic::amdgcn_implicitarg_ptr: {
7993 return getImplicitArgPtr(DAG,
DL);
7994 return getPreloadedValue(DAG, *MFI, VT,
7997 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8003 return getPreloadedValue(DAG, *MFI, VT,
8006 case Intrinsic::amdgcn_dispatch_id: {
8009 case Intrinsic::amdgcn_rcp:
8011 case Intrinsic::amdgcn_rsq:
8013 case Intrinsic::amdgcn_rsq_legacy:
8017 case Intrinsic::amdgcn_rcp_legacy:
8021 case Intrinsic::amdgcn_rsq_clamp: {
8035 case Intrinsic::r600_read_ngroups_x:
8039 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8042 case Intrinsic::r600_read_ngroups_y:
8046 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8049 case Intrinsic::r600_read_ngroups_z:
8053 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8056 case Intrinsic::r600_read_global_size_x:
8060 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8063 case Intrinsic::r600_read_global_size_y:
8067 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8070 case Intrinsic::r600_read_global_size_z:
8074 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
8077 case Intrinsic::r600_read_local_size_x:
8081 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8083 case Intrinsic::r600_read_local_size_y:
8087 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8089 case Intrinsic::r600_read_local_size_z:
8093 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
8095 case Intrinsic::amdgcn_workgroup_id_x:
8096 return getPreloadedValue(DAG, *MFI, VT,
8098 case Intrinsic::amdgcn_workgroup_id_y:
8099 return getPreloadedValue(DAG, *MFI, VT,
8101 case Intrinsic::amdgcn_workgroup_id_z:
8102 return getPreloadedValue(DAG, *MFI, VT,
8104 case Intrinsic::amdgcn_wave_id:
8105 return lowerWaveID(DAG,
Op);
8106 case Intrinsic::amdgcn_lds_kernel_id: {
8108 return getLDSKernelId(DAG,
DL);
8109 return getPreloadedValue(DAG, *MFI, VT,
8112 case Intrinsic::amdgcn_workitem_id_x:
8113 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
8114 case Intrinsic::amdgcn_workitem_id_y:
8115 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
8116 case Intrinsic::amdgcn_workitem_id_z:
8117 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
8118 case Intrinsic::amdgcn_wavefrontsize:
8121 case Intrinsic::amdgcn_s_buffer_load: {
8122 unsigned CPol =
Op.getConstantOperandVal(3);
8129 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3),
8132 case Intrinsic::amdgcn_fdiv_fast:
8133 return lowerFDIV_FAST(
Op, DAG);
8134 case Intrinsic::amdgcn_sin:
8137 case Intrinsic::amdgcn_cos:
8140 case Intrinsic::amdgcn_mul_u24:
8142 case Intrinsic::amdgcn_mul_i24:
8145 case Intrinsic::amdgcn_log_clamp: {
8151 case Intrinsic::amdgcn_fract:
8154 case Intrinsic::amdgcn_class:
8156 Op.getOperand(1),
Op.getOperand(2));
8157 case Intrinsic::amdgcn_div_fmas:
8159 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3),
8162 case Intrinsic::amdgcn_div_fixup:
8164 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8166 case Intrinsic::amdgcn_div_scale: {
8179 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
8182 Denominator, Numerator);
8184 case Intrinsic::amdgcn_icmp: {
8186 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
8187 Op.getConstantOperandVal(2) == 0 &&
8192 case Intrinsic::amdgcn_fcmp: {
8195 case Intrinsic::amdgcn_ballot:
8197 case Intrinsic::amdgcn_fmed3:
8199 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8200 case Intrinsic::amdgcn_fdot2:
8202 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3),
8204 case Intrinsic::amdgcn_fmul_legacy:
8206 Op.getOperand(1),
Op.getOperand(2));
8207 case Intrinsic::amdgcn_sffbh:
8209 case Intrinsic::amdgcn_sbfe:
8211 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8212 case Intrinsic::amdgcn_ubfe:
8214 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8215 case Intrinsic::amdgcn_cvt_pkrtz:
8216 case Intrinsic::amdgcn_cvt_pknorm_i16:
8217 case Intrinsic::amdgcn_cvt_pknorm_u16:
8218 case Intrinsic::amdgcn_cvt_pk_i16:
8219 case Intrinsic::amdgcn_cvt_pk_u16: {
8221 EVT VT =
Op.getValueType();
8224 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8226 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8228 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8230 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8236 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
8239 Op.getOperand(1),
Op.getOperand(2));
8242 case Intrinsic::amdgcn_fmad_ftz:
8244 Op.getOperand(2),
Op.getOperand(3));
8246 case Intrinsic::amdgcn_if_break:
8248 Op->getOperand(1),
Op->getOperand(2)), 0);
8250 case Intrinsic::amdgcn_groupstaticsize: {
8262 case Intrinsic::amdgcn_is_shared:
8263 case Intrinsic::amdgcn_is_private: {
8265 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8267 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8275 case Intrinsic::amdgcn_perm:
8277 Op.getOperand(2),
Op.getOperand(3));
8278 case Intrinsic::amdgcn_reloc_constant: {
8282 auto RelocSymbol = cast<GlobalVariable>(
8288 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8289 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8290 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8291 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8292 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8293 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8294 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8295 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8296 if (
Op.getOperand(4).getValueType() == MVT::i32)
8302 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
8303 Op.getOperand(3), IndexKeyi32);
8305 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8306 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8307 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8308 if (
Op.getOperand(6).getValueType() == MVT::i32)
8314 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8315 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8316 IndexKeyi32, Op.getOperand(7)});
8321 return lowerImage(
Op, ImageDimIntr, DAG,
false);
8332 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8338 unsigned NewOpcode)
const {
8342 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8343 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
8357 auto *
M = cast<MemSDNode>(
Op);
8361 M->getMemOperand());
8372 unsigned NewOpcode)
const {
8376 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
8377 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
8391 auto *
M = cast<MemSDNode>(
Op);
8395 M->getMemOperand());
8400 unsigned IntrID =
Op.getConstantOperandVal(1);
8404 case Intrinsic::amdgcn_ds_ordered_add:
8405 case Intrinsic::amdgcn_ds_ordered_swap: {
8410 unsigned IndexOperand =
M->getConstantOperandVal(7);
8411 unsigned WaveRelease =
M->getConstantOperandVal(8);
8412 unsigned WaveDone =
M->getConstantOperandVal(9);
8414 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8415 IndexOperand &= ~0x3f;
8416 unsigned CountDw = 0;
8419 CountDw = (IndexOperand >> 24) & 0xf;
8420 IndexOperand &= ~(0xf << 24);
8422 if (CountDw < 1 || CountDw > 4) {
8424 "ds_ordered_count: dword count must be between 1 and 4");
8431 if (WaveDone && !WaveRelease)
8434 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8435 unsigned ShaderType =
8437 unsigned Offset0 = OrderedCountIndex << 2;
8438 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (
Instruction << 4);
8441 Offset1 |= (CountDw - 1) << 6;
8444 Offset1 |= ShaderType << 2;
8446 unsigned Offset = Offset0 | (Offset1 << 8);
8455 M->getVTList(), Ops,
M->getMemoryVT(),
8456 M->getMemOperand());
8458 case Intrinsic::amdgcn_ds_fadd: {
8462 case Intrinsic::amdgcn_ds_fadd:
8468 M->getOperand(0),
M->getOperand(2),
M->getOperand(3),
8469 M->getMemOperand());
8471 case Intrinsic::amdgcn_ds_fmin:
8472 case Intrinsic::amdgcn_ds_fmax: {
8476 case Intrinsic::amdgcn_ds_fmin:
8479 case Intrinsic::amdgcn_ds_fmax:
8492 M->getMemoryVT(),
M->getMemOperand());
8494 case Intrinsic::amdgcn_buffer_load:
8495 case Intrinsic::amdgcn_buffer_load_format: {
8496 unsigned Glc =
Op.getConstantOperandVal(5);
8497 unsigned Slc =
Op.getConstantOperandVal(6);
8509 setBufferOffsets(
Op.getOperand(4), DAG, &Ops[3]);
8511 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
8514 EVT VT =
Op.getValueType();
8516 auto *
M = cast<MemSDNode>(
Op);
8517 EVT LoadVT =
Op.getValueType();
8525 return handleByteShortBufferLoads(DAG, LoadVT,
DL, Ops,
8526 M->getMemOperand());
8528 return getMemIntrinsicNode(Opc,
DL,
Op->getVTList(), Ops, IntVT,
8529 M->getMemOperand(), DAG);
8531 case Intrinsic::amdgcn_raw_buffer_load:
8532 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8533 case Intrinsic::amdgcn_raw_buffer_load_format:
8534 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8535 const bool IsFormat =
8536 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8537 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8539 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8540 auto Offsets = splitBufferOffsets(
Op.getOperand(3), DAG);
8553 auto *
M = cast<MemSDNode>(
Op);
8554 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8556 case Intrinsic::amdgcn_struct_buffer_load:
8557 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8558 case Intrinsic::amdgcn_struct_buffer_load_format:
8559 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8560 const bool IsFormat =
8561 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8562 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8564 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8565 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
8578 return lowerIntrinsicLoad(cast<MemSDNode>(
Op), IsFormat, DAG, Ops);
8580 case Intrinsic::amdgcn_tbuffer_load: {
8582 EVT LoadVT =
Op.getValueType();
8585 unsigned Dfmt =
Op.getConstantOperandVal(7);
8586 unsigned Nfmt =
Op.getConstantOperandVal(8);
8587 unsigned Glc =
Op.getConstantOperandVal(9);
8588 unsigned Slc =
Op.getConstantOperandVal(10);
8606 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
8609 case Intrinsic::amdgcn_raw_tbuffer_load:
8610 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8612 EVT LoadVT =
Op.getValueType();
8613 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8614 auto Offsets = splitBufferOffsets(
Op.getOperand(3), DAG);
8633 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
8636 case Intrinsic::amdgcn_struct_tbuffer_load:
8637 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8639 EVT LoadVT =
Op.getValueType();
8640 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8641 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
8660 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
8663 case Intrinsic::amdgcn_buffer_atomic_swap:
8664 case Intrinsic::amdgcn_buffer_atomic_add:
8665 case Intrinsic::amdgcn_buffer_atomic_sub:
8666 case Intrinsic::amdgcn_buffer_atomic_csub:
8667 case Intrinsic::amdgcn_buffer_atomic_smin:
8668 case Intrinsic::amdgcn_buffer_atomic_umin:
8669 case Intrinsic::amdgcn_buffer_atomic_smax:
8670 case Intrinsic::amdgcn_buffer_atomic_umax:
8671 case Intrinsic::amdgcn_buffer_atomic_and:
8672 case Intrinsic::amdgcn_buffer_atomic_or:
8673 case Intrinsic::amdgcn_buffer_atomic_xor:
8674 case Intrinsic::amdgcn_buffer_atomic_fadd: {
8675 unsigned Slc =
Op.getConstantOperandVal(6);
8688 setBufferOffsets(
Op.getOperand(5), DAG, &Ops[4]);
8690 EVT VT =
Op.getValueType();
8692 auto *
M = cast<MemSDNode>(
Op);
8693 unsigned Opcode = 0;
8696 case Intrinsic::amdgcn_buffer_atomic_swap:
8699 case Intrinsic::amdgcn_buffer_atomic_add:
8702 case Intrinsic::amdgcn_buffer_atomic_sub:
8705 case Intrinsic::amdgcn_buffer_atomic_csub:
8708 case Intrinsic::amdgcn_buffer_atomic_smin:
8711 case Intrinsic::amdgcn_buffer_atomic_umin:
8714 case Intrinsic::amdgcn_buffer_atomic_smax:
8717 case Intrinsic::amdgcn_buffer_atomic_umax:
8720 case Intrinsic::amdgcn_buffer_atomic_and:
8723 case Intrinsic::amdgcn_buffer_atomic_or:
8726 case Intrinsic::amdgcn_buffer_atomic_xor:
8729 case Intrinsic::amdgcn_buffer_atomic_fadd:
8737 M->getMemOperand());
8739 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8740 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8742 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
8743 return lowerRawBufferAtomicIntrin(
Op, DAG,
8745 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8746 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8748 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
8749 return lowerStructBufferAtomicIntrin(
Op, DAG,
8751 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8752 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8754 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8755 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8757 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8758 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8760 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8761 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8763 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8764 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8766 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8767 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8769 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8770 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8772 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8773 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8775 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8776 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8778 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8779 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8781 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8782 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8784 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8785 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8787 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8788 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8790 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8791 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8793 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8794 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8796 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8797 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8799 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8800 return lowerRawBufferAtomicIntrin(
Op, DAG,
8802 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8803 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8804 return lowerStructBufferAtomicIntrin(
Op, DAG,
8806 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8807 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8809 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8810 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8812 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8813 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8814 return lowerStructBufferAtomicIntrin(
Op, DAG,
8816 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8817 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8818 return lowerStructBufferAtomicIntrin(
Op, DAG,
8820 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8821 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8822 return lowerStructBufferAtomicIntrin(
Op, DAG,
8824 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8825 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8826 return lowerStructBufferAtomicIntrin(
Op, DAG,
8828 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8829 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8831 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8832 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8834 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8835 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8837 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8838 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8840 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8841 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8843 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8844 return lowerStructBufferAtomicIntrin(
Op, DAG,
8847 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
8848 unsigned Slc =
Op.getConstantOperandVal(7);
8862 setBufferOffsets(
Op.getOperand(6), DAG, &Ops[5]);
8864 EVT VT =
Op.getValueType();
8865 auto *
M = cast<MemSDNode>(
Op);
8868 Op->getVTList(), Ops, VT,
M->getMemOperand());
8870 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8871 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
8872 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
8873 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
8887 EVT VT =
Op.getValueType();
8888 auto *
M = cast<MemSDNode>(
Op);
8891 Op->getVTList(), Ops, VT,
M->getMemOperand());
8893 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
8895 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
8896 auto Offsets = splitBufferOffsets(
Op.getOperand(6), DAG);
8910 EVT VT =
Op.getValueType();
8911 auto *
M = cast<MemSDNode>(
Op);
8914 Op->getVTList(), Ops, VT,
M->getMemOperand());
8916 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
8918 SDValue NodePtr =
M->getOperand(2);
8919 SDValue RayExtent =
M->getOperand(3);
8920 SDValue RayOrigin =
M->getOperand(4);
8922 SDValue RayInvDir =
M->getOperand(6);
8940 const unsigned NumVDataDwords = 4;
8941 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
8942 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
8946 const unsigned BaseOpcodes[2][2] = {
8947 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
8948 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
8949 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
8953 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
8954 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
8955 : AMDGPU::MIMGEncGfx10NSA,
8956 NumVDataDwords, NumVAddrDwords);
8960 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
8961 : AMDGPU::MIMGEncGfx10Default,
8962 NumVDataDwords, NumVAddrDwords);
8968 auto packLanes = [&DAG, &Ops, &
DL] (
SDValue Op,
bool IsAligned) {
8971 if (Lanes[0].getValueSizeInBits() == 32) {
8972 for (
unsigned I = 0;
I < 3; ++
I)
8979 { Lanes[0], Lanes[1] })));
8986 { Elt0, Lanes[0] })));
8990 { Lanes[1], Lanes[2] })));
8995 if (UseNSA && IsGFX11Plus) {
9003 for (
unsigned I = 0;
I < 3; ++
I) {
9006 {DirLanes[I], InvDirLanes[I]})));
9021 packLanes(RayOrigin,
true);
9022 packLanes(RayDir,
true);
9023 packLanes(RayInvDir,
false);
9028 if (NumVAddrDwords > 12) {
9048 case Intrinsic::amdgcn_global_atomic_fmin:
9049 case Intrinsic::amdgcn_global_atomic_fmax:
9050 case Intrinsic::amdgcn_global_atomic_fmin_num:
9051 case Intrinsic::amdgcn_global_atomic_fmax_num:
9052 case Intrinsic::amdgcn_flat_atomic_fmin:
9053 case Intrinsic::amdgcn_flat_atomic_fmax:
9054 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9055 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9062 unsigned Opcode = 0;
9064 case Intrinsic::amdgcn_global_atomic_fmin:
9065 case Intrinsic::amdgcn_global_atomic_fmin_num:
9066 case Intrinsic::amdgcn_flat_atomic_fmin:
9067 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9071 case Intrinsic::amdgcn_global_atomic_fmax:
9072 case Intrinsic::amdgcn_global_atomic_fmax_num:
9073 case Intrinsic::amdgcn_flat_atomic_fmax:
9074 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9082 M->getVTList(), Ops,
M->getMemoryVT(),
9083 M->getMemOperand());
9085 case Intrinsic::amdgcn_s_get_barrier_state: {
9089 bool IsInlinableBarID =
false;
9092 if (isa<ConstantSDNode>(
Op->getOperand(2))) {
9093 BarID = cast<ConstantSDNode>(
Op->getOperand(2))->getSExtValue();
9097 if (IsInlinableBarID) {
9098 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9102 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9114 return lowerImage(
Op, ImageDimIntr, DAG,
true);
9122SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
9132 bool IsTFE = VTList.
NumVTs == 3;
9135 unsigned NumOpDWords = NumValueDWords + 1;
9140 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList, Ops,
9141 OpDWordsVT, OpDWordsMMO, DAG);
9156 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9162 WidenedMemVT, WidenedMMO);
9172 bool ImageStore)
const {
9207 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
9213 if ((NumElements % 2) == 1) {
9215 unsigned I = Elts.
size() / 2;
9231 if (NumElements == 3) {
9252 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
9255 switch (IntrinsicID) {
9256 case Intrinsic::amdgcn_exp_compr: {
9260 "intrinsic not supported on subtarget",
DL.getDebugLoc());
9283 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9286 case Intrinsic::amdgcn_s_barrier: {
9289 unsigned WGSize =
ST.getFlatWorkGroupSizes(MF.
getFunction()).second;
9290 if (WGSize <=
ST.getWavefrontSize())
9292 Op.getOperand(0)), 0);
9296 if (
ST.hasSplitBarriers()) {
9301 MVT::Other, K,
Op.getOperand(0)),
9312 case Intrinsic::amdgcn_tbuffer_store: {
9316 VData = handleD16VData(VData, DAG);
9317 unsigned Dfmt =
Op.getConstantOperandVal(8);
9318 unsigned Nfmt =
Op.getConstantOperandVal(9);
9319 unsigned Glc =
Op.getConstantOperandVal(10);
9320 unsigned Slc =
Op.getConstantOperandVal(11);
9338 M->getMemoryVT(),
M->getMemOperand());
9341 case Intrinsic::amdgcn_struct_tbuffer_store:
9342 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9346 VData = handleD16VData(VData, DAG);
9347 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9348 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
9366 M->getMemoryVT(),
M->getMemOperand());
9369 case Intrinsic::amdgcn_raw_tbuffer_store:
9370 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9374 VData = handleD16VData(VData, DAG);
9375 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9376 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
9394 M->getMemoryVT(),
M->getMemOperand());
9397 case Intrinsic::amdgcn_buffer_store:
9398 case Intrinsic::amdgcn_buffer_store_format: {
9402 VData = handleD16VData(VData, DAG);
9403 unsigned Glc =
Op.getConstantOperandVal(6);
9404 unsigned Slc =
Op.getConstantOperandVal(7);
9417 setBufferOffsets(
Op.getOperand(5), DAG, &Ops[4]);
9419 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
9426 if (VDataType == MVT::i8 || VDataType == MVT::i16)
9427 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9430 M->getMemoryVT(),
M->getMemOperand());
9433 case Intrinsic::amdgcn_raw_buffer_store:
9434 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9435 case Intrinsic::amdgcn_raw_buffer_store_format:
9436 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9437 const bool IsFormat =
9438 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9439 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9446 VData = handleD16VData(VData, DAG);
9456 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9457 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
9477 return handleByteShortBufferStores(DAG, VDataVT,
DL, Ops, M);
9480 M->getMemoryVT(),
M->getMemOperand());
9483 case Intrinsic::amdgcn_struct_buffer_store:
9484 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9485 case Intrinsic::amdgcn_struct_buffer_store_format:
9486 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9487 const bool IsFormat =
9488 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9489 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9497 VData = handleD16VData(VData, DAG);
9507 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9508 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
9529 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9532 M->getMemoryVT(),
M->getMemOperand());
9534 case Intrinsic::amdgcn_raw_buffer_load_lds:
9535 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9536 case Intrinsic::amdgcn_struct_buffer_load_lds:
9537 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9541 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9542 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9543 unsigned OpOffset = HasVIndex ? 1 : 0;
9544 SDValue VOffset =
Op.getOperand(5 + OpOffset);
9546 unsigned Size =
Op->getConstantOperandVal(4);
9552 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9553 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9554 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9555 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9558 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9559 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9560 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9561 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9564 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9565 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9566 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9567 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9575 if (HasVIndex && HasVOffset)
9581 else if (HasVOffset)
9584 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9588 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
9596 auto *
M = cast<MemSDNode>(
Op);
9623 case Intrinsic::amdgcn_global_load_lds: {
9625 unsigned Size =
Op->getConstantOperandVal(4);
9630 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9633 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9636 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9640 auto *
M = cast<MemSDNode>(
Op);
9653 if (
LHS->isDivergent())
9657 RHS.getOperand(0).getValueType() == MVT::i32) {
9660 VOffset =
RHS.getOperand(0);
9665 if (!
Addr->isDivergent()) {
9681 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
9701 case Intrinsic::amdgcn_end_cf:
9703 Op->getOperand(2), Chain), 0);
9704 case Intrinsic::amdgcn_s_barrier_init:
9705 case Intrinsic::amdgcn_s_barrier_join:
9706 case Intrinsic::amdgcn_s_wakeup_barrier: {
9711 bool IsInlinableBarID =
false;
9714 if (isa<ConstantSDNode>(BarOp)) {
9715 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9719 if (IsInlinableBarID) {
9720 switch (IntrinsicID) {
9723 case Intrinsic::amdgcn_s_barrier_init:
9724 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9726 case Intrinsic::amdgcn_s_barrier_join:
9727 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9729 case Intrinsic::amdgcn_s_wakeup_barrier:
9730 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9737 switch (IntrinsicID) {
9740 case Intrinsic::amdgcn_s_barrier_init:
9741 Opc = AMDGPU::S_BARRIER_INIT_M0;
9743 case Intrinsic::amdgcn_s_barrier_join:
9744 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9746 case Intrinsic::amdgcn_s_wakeup_barrier:
9747 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9752 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9758 if (!IsInlinableBarID) {
9763 Op.getOperand(2), M0Val),
9767 }
else if (!IsInlinableBarID) {
9777 return lowerImage(
Op, ImageDimIntr, DAG,
true);
9790std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9797 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
9814 unsigned Overflow = ImmOffset & ~MaxImm;
9815 ImmOffset -= Overflow;
9816 if ((int32_t)Overflow < 0) {
9817 Overflow += ImmOffset;
9826 SDValue Ops[] = { N0, OverflowVal };
9841void SITargetLowering::setBufferOffsets(
SDValue CombinedOffset,
9843 Align Alignment)
const {
9846 if (
auto *
C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
9849 if (
TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
9860 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
9862 TII->splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
9879SDValue SITargetLowering::bufferRsrcPtrToVector(
SDValue MaybePointer,
9882 return MaybePointer;
9901 auto [LowHalf, HighHalf] = DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
9904 std::optional<uint32_t> ConstStride = std::nullopt;
9905 if (
auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
9906 ConstStride = ConstNode->getZExtValue();
9909 if (!ConstStride || *ConstStride != 0) {
9912 ShiftedStride = DAG.
getConstant(*ConstStride << 16, Loc, MVT::i32);
9919 NewHighHalf = DAG.
getNode(
ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
9923 NewHighHalf, NumRecords, Flags);
9930SITargetLowering::handleByteShortBufferLoads(
SelectionDAG &DAG,
EVT LoadVT,
9951 if (VDataType == MVT::f16)
9955 Ops[1] = BufferStoreExt;
9960 M->getMemOperand());
9985SDValue SITargetLowering::widenLoad(
LoadSDNode *Ld, DAGCombinerInfo &DCI)
const {
10001 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
10008 "unexpected vector extload");
10021 "unexpected fp extload");
10039 DCI.AddToWorklist(Cvt.
getNode());
10044 DCI.AddToWorklist(Cvt.
getNode());
10055 if (
Info.isEntryFunction())
10056 return Info.getUserSGPRInfo().hasFlatScratchInit();
10064 EVT MemVT =
Load->getMemoryVT();
10077 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10080 BasePtr, RealMemVT, MMO);
10110 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
10111 "Custom lowering for non-i32 vectors hasn't been implemented.");
10114 unsigned AS =
Load->getAddressSpace();
10133 if (!
Op->isDivergent() && Alignment >=
Align(4) && NumElements < 32) {
10150 Alignment >=
Align(4) && NumElements < 32) {
10165 if (NumElements > 4)
10185 if (NumElements > 2)
10190 if (NumElements > 4)
10202 auto Flags =
Load->getMemOperand()->getFlags();
10204 Load->getAlign(), Flags, &
Fast) &&
10213 MemVT, *
Load->getMemOperand())) {
10223 EVT VT =
Op.getValueType();
10260 EVT VT =
Op.getValueType();
10263 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs() ||
10270 if (!AllowInaccurateRcp && VT != MVT::f16)
10273 if (CLHS->isExactlyValue(1.0)) {
10290 if (CLHS->isExactlyValue(-1.0)) {
10299 if (!AllowInaccurateRcp && (VT != MVT::f16 || !
Flags.hasAllowReciprocal()))
10313 EVT VT =
Op.getValueType();
10316 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs() ||
10318 if (!AllowInaccurateDiv)
10339 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
10352 return DAG.
getNode(Opcode, SL, VTList,
10361 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
10374 return DAG.
getNode(Opcode, SL, VTList,
10380 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10381 return FastLowered;
10408 const APFloat K0Val(0x1p+96f);
10411 const APFloat K1Val(0x1p-32f);
10438 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
10439 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
10440 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10445 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10446 return FastLowered;
10453 Flags.setNoFPExcept(
true);
10470 DenominatorScaled, Flags);
10472 DenominatorScaled, Flags);
10484 const bool HasDynamicDenormals =
10490 if (!PreservesDenormals) {
10498 if (HasDynamicDenormals) {
10502 SavedDenormMode =
SDValue(GetReg, 0);
10510 const SDValue EnableDenormValue =
10519 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10520 {EnableDenormValue,
BitField, Glue});
10533 ApproxRcp, One, NegDivScale0, Flags);
10536 ApproxRcp, Fma0, Flags);
10539 Fma1, Fma1, Flags);
10542 NumeratorScaled,
Mul, Flags);
10545 Fma2, Fma1,
Mul, Fma2, Flags);
10548 NumeratorScaled, Fma3, Flags);
10550 if (!PreservesDenormals) {
10557 Fma4.
getValue(1), DisableDenormValue,
10560 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
10561 const SDValue DisableDenormValue =
10562 HasDynamicDenormals
10567 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10578 {Fma4, Fma1, Fma3, Scale},
Flags);
10584 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
10585 return FastLowered;
10613 NegDivScale0,
Mul, DivScale1);
10645 Fma4, Fma3,
Mul, Scale);
10651 EVT VT =
Op.getValueType();
10653 if (VT == MVT::f32)
10654 return LowerFDIV32(
Op, DAG);
10656 if (VT == MVT::f64)
10657 return LowerFDIV64(
Op, DAG);
10659 if (VT == MVT::f16)
10660 return LowerFDIV16(
Op, DAG);
10669 EVT ResultExpVT =
Op->getValueType(1);
10670 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10700 if (VT == MVT::i1) {
10703 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
10707 Store->getValue().getValueType().getScalarType() == MVT::i32);
10709 unsigned AS =
Store->getAddressSpace();
10728 if (NumElements > 4)
10735 VT, *
Store->getMemOperand()))
10744 if (NumElements > 2)
10748 if (NumElements > 4 ||
10757 auto Flags =
Store->getMemOperand()->getFlags();
10792 MVT VT =
Op.getValueType().getSimpleVT();
10961 EVT VT =
Op.getValueType();
10978 switch (
Op.getOpcode()) {
11004 EVT VT =
Op.getValueType();
11020 DAGCombinerInfo &DCI)
const {
11021 EVT VT =
N->getValueType(0);
11023 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11030 EVT SrcVT = Src.getValueType();
11036 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11039 DCI.AddToWorklist(Cvt.
getNode());
11042 if (ScalarVT != MVT::f32) {
11054 DAGCombinerInfo &DCI)
const {
11055 SDValue MagnitudeOp =
N->getOperand(0);
11056 SDValue SignOp =
N->getOperand(1);
11114 unsigned AddrSpace,
11116 DAGCombinerInfo &DCI)
const {
11146 AM.HasBaseReg =
true;
11147 AM.BaseOffs =
Offset.getSExtValue();
11152 EVT VT =
N->getValueType(0);
11158 Flags.setNoUnsignedWrap(
N->getFlags().hasNoUnsignedWrap() &&
11169 switch (
N->getOpcode()) {
11180 DAGCombinerInfo &DCI)
const {
11189 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
11190 N->getMemoryVT(), DCI);
11194 NewOps[PtrIdx] = NewPtr;
11203 return (Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11204 (Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11213SDValue SITargetLowering::splitBinaryBitConstantOp(
11214 DAGCombinerInfo &DCI,
11236 if (V.getValueType() != MVT::i1)
11238 switch (V.getOpcode()) {
11257 if (!(
C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11258 if (!(
C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11259 if (!(
C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11260 if (!(
C & 0xff000000)) ZeroByteMask |= 0xff000000;
11261 uint32_t NonZeroByteMask = ~ZeroByteMask;
11262 if ((NonZeroByteMask &
C) != NonZeroByteMask)
11275 assert(V.getValueSizeInBits() == 32);
11277 if (V.getNumOperands() != 2)
11286 switch (V.getOpcode()) {
11291 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11296 return (0x03020100 & ~ConstMask) | ConstMask;
11303 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
11309 return uint32_t(0x0c0c0c0c03020100ull >>
C);
11316 DAGCombinerInfo &DCI)
const {
11317 if (DCI.isBeforeLegalize())
11321 EVT VT =
N->getValueType(0);
11327 if (VT == MVT::i64 && CRHS) {
11333 if (CRHS && VT == MVT::i32) {
11342 if (
auto *CShift = dyn_cast<ConstantSDNode>(
LHS->getOperand(1))) {
11343 unsigned Shift = CShift->getZExtValue();
11345 unsigned Offset = NB + Shift;
11346 if ((
Offset & (Bits - 1)) == 0) {
11349 LHS->getOperand(0),
11364 isa<ConstantSDNode>(
LHS.getOperand(2))) {
11370 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11385 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
11390 if (
X !=
LHS.getOperand(1))
11428 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
11429 LHS.getOperand(0) ==
LHS.getOperand(1))) {
11432 Mask->getZExtValue() & ~OrdMask :
11433 Mask->getZExtValue() & OrdMask;
11441 if (VT == MVT::i32 &&
11454 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11457 if (LHSMask != ~0u && RHSMask != ~0u) {
11460 if (LHSMask > RHSMask) {
11467 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11468 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11471 if (!(LHSUsedLanes & RHSUsedLanes) &&
11474 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11481 for (
unsigned I = 0;
I < 32;
I += 8) {
11483 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11484 Mask &= (0x0c <<
I) & 0xffffffff;
11493 LHS.getOperand(0),
RHS.getOperand(0),
11542static const std::optional<ByteProvider<SDValue>>
11544 unsigned Depth = 0) {
11547 return std::nullopt;
11549 auto ValueSize =
Op.getValueSizeInBits();
11550 if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
11551 return std::nullopt;
11553 switch (
Op->getOpcode()) {
11564 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11565 NarrowVT = VTSign->getVT();
11568 return std::nullopt;
11571 if (SrcIndex >= NarrowByteWidth)
11572 return std::nullopt;
11578 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11580 return std::nullopt;
11582 uint64_t BitShift = ShiftOp->getZExtValue();
11584 if (BitShift % 8 != 0)
11585 return std::nullopt;
11587 SrcIndex += BitShift / 8;
11605static const std::optional<ByteProvider<SDValue>>
11607 unsigned StartingIndex = 0) {
11611 return std::nullopt;
11613 unsigned BitWidth =
Op.getScalarValueSizeInBits();
11615 return std::nullopt;
11617 return std::nullopt;
11619 switch (
Op.getOpcode()) {
11624 return std::nullopt;
11628 return std::nullopt;
11631 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
11632 return std::nullopt;
11633 if (!
LHS ||
LHS->isConstantZero())
11635 if (!
RHS ||
RHS->isConstantZero())
11637 return std::nullopt;
11641 auto BitMaskOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11643 return std::nullopt;
11645 uint32_t BitMask = BitMaskOp->getZExtValue();
11649 if ((IndexMask & BitMask) != IndexMask) {
11652 if (IndexMask & BitMask)
11653 return std::nullopt;
11662 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
11663 if (!ShiftOp ||
Op.getValueType().isVector())
11664 return std::nullopt;
11666 uint64_t BitsProvided =
Op.getValueSizeInBits();
11667 if (BitsProvided % 8 != 0)
11668 return std::nullopt;
11670 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11672 return std::nullopt;
11674 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11675 uint64_t ByteShift = BitShift / 8;
11677 uint64_t NewIndex = (
Index + ByteShift) % ConcatSizeInBytes;
11678 uint64_t BytesProvided = BitsProvided / 8;
11679 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11680 NewIndex %= BytesProvided;
11686 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11688 return std::nullopt;
11690 uint64_t BitShift = ShiftOp->getZExtValue();
11692 return std::nullopt;
11694 auto BitsProvided =
Op.getScalarValueSizeInBits();
11695 if (BitsProvided % 8 != 0)
11696 return std::nullopt;
11698 uint64_t BytesProvided = BitsProvided / 8;
11699 uint64_t ByteShift = BitShift / 8;
11704 return BytesProvided - ByteShift >
Index
11711 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11713 return std::nullopt;
11715 uint64_t BitShift = ShiftOp->getZExtValue();
11716 if (BitShift % 8 != 0)
11717 return std::nullopt;
11718 uint64_t ByteShift = BitShift / 8;
11724 return Index < ByteShift
11727 Depth + 1, StartingIndex);
11740 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11741 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11743 if (NarrowBitWidth % 8 != 0)
11744 return std::nullopt;
11745 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11747 if (
Index >= NarrowByteWidth)
11749 ? std::optional<ByteProvider<SDValue>>(
11758 if (NarrowByteWidth >=
Index) {
11763 return std::nullopt;
11770 return std::nullopt;
11774 auto L = cast<LoadSDNode>(
Op.getNode());
11776 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11777 if (NarrowBitWidth % 8 != 0)
11778 return std::nullopt;
11779 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11784 if (
Index >= NarrowByteWidth) {
11786 ? std::optional<ByteProvider<SDValue>>(
11791 if (NarrowByteWidth >
Index) {
11795 return std::nullopt;
11800 Depth + 1, StartingIndex);
11803 auto IdxOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11805 return std::nullopt;
11806 auto VecIdx = IdxOp->getZExtValue();
11807 auto ScalarSize =
Op.getScalarValueSizeInBits();
11808 if (ScalarSize != 32) {
11809 if ((VecIdx + 1) * ScalarSize > 32)
11810 return std::nullopt;
11811 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 +
Index;
11815 StartingIndex,
Index);
11819 auto PermMask = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
11821 return std::nullopt;
11824 (PermMask->getZExtValue() & (0xFF << (
Index * 8))) >> (
Index * 8);
11825 if (IdxMask > 0x07 && IdxMask != 0x0c)
11826 return std::nullopt;
11828 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
11829 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
11831 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
11837 return std::nullopt;
11852 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
11856 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
11859 auto MemVT = L->getMemoryVT();
11862 return L->getMemoryVT().getSizeInBits() == 16;
11872 int Low8 = Mask & 0xff;
11873 int Hi8 = (Mask & 0xff00) >> 8;
11875 assert(Low8 < 8 && Hi8 < 8);
11877 bool IsConsecutive = (Hi8 - Low8 == 1);
11882 bool Is16Aligned = !(Low8 % 2);
11884 return IsConsecutive && Is16Aligned;
11892 int Low16 = PermMask & 0xffff;
11893 int Hi16 = (PermMask & 0xffff0000) >> 16;
11895 assert(
Op.getValueType().isByteSized());
11906 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
11908 if (!OtherOpIs16Bit)
11917 EVT VT =
N->getValueType(0);
11919 if (VT != MVT::i32)
11924 for (
int i = 0; i < 4; i++) {
11926 std::optional<ByteProvider<SDValue>>
P =
11929 if (!
P ||
P->isConstantZero())
11934 if (PermNodes.
size() != 4)
11938 std::optional<int> SecondSrc;
11940 for (
size_t i = 0; i < PermNodes.
size(); i++) {
11941 auto PermOp = PermNodes[i];
11944 int SrcByteAdjust = 4;
11946 if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
11947 if (SecondSrc.has_value())
11948 if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
11953 assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
11956 assert(PermOp.SrcOffset + SrcByteAdjust < 8);
11958 PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
11962 SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
11963 : *PermNodes[FirstSrc].Src;
11967 (
N->getOperand(0) ==
Op ||
N->getOperand(0) == OtherOp) &&
11968 (
N->getOperand(1) ==
Op ||
N->getOperand(1) == OtherOp))
11972 if (
Op == OtherOp &&
Op.getValueSizeInBits() == 32) {
11973 int Low16 = PermMask & 0xffff;
11974 int Hi16 = (PermMask & 0xffff0000) >> 16;
11976 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
11977 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
11980 if (WellFormedLow && WellFormedHi)
11986 assert(
Op.getValueType().isByteSized() &&
12005 DAGCombinerInfo &DCI)
const {
12010 EVT VT =
N->getValueType(0);
12011 if (VT == MVT::i1) {
12016 if (Src !=
RHS.getOperand(0))
12021 if (!CLHS || !CRHS)
12025 static const uint32_t MaxMask = 0x3ff;
12039 isa<ConstantSDNode>(
LHS.getOperand(2))) {
12044 Sel |=
LHS.getConstantOperandVal(2);
12053 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12057 auto usesCombinedOperand = [](
SDNode *OrUse) {
12060 !OrUse->getValueType(0).isVector())
12064 for (
auto VUse : OrUse->uses()) {
12065 if (!VUse->getValueType(0).isVector())
12072 if (VUse->getOpcode() == VectorwiseOp)
12078 if (!
any_of(
N->uses(), usesCombinedOperand))
12084 if (LHSMask != ~0u && RHSMask != ~0u) {
12087 if (LHSMask > RHSMask) {
12094 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12095 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12098 if (!(LHSUsedLanes & RHSUsedLanes) &&
12101 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12103 LHSMask &= ~RHSUsedLanes;
12104 RHSMask &= ~LHSUsedLanes;
12106 LHSMask |= LHSUsedLanes & 0x04040404;
12112 LHS.getOperand(0),
RHS.getOperand(0),
12116 if (LHSMask == ~0u || RHSMask == ~0u) {
12122 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12137 if (SrcVT == MVT::i32) {
12143 DCI.AddToWorklist(LowOr.
getNode());
12144 DCI.AddToWorklist(HiBits.
getNode());
12152 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(
N->getOperand(1));
12156 N->getOperand(0), CRHS))
12164 DAGCombinerInfo &DCI)
const {
12165 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
12174 EVT VT =
N->getValueType(0);
12175 if (CRHS && VT == MVT::i64) {
12197 LHS->getOperand(0), FNegLHS, FNegRHS);
12206 DAGCombinerInfo &DCI)
const {
12211 EVT VT =
N->getValueType(0);
12212 if (VT != MVT::i32)
12216 if (Src.getValueType() != MVT::i16)
12223SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
12224 DAGCombinerInfo &DCI)
const {
12226 auto *VTSign = cast<VTSDNode>(
N->getOperand(1));
12231 VTSign->getVT() == MVT::i8) ||
12233 VTSign->getVT() == MVT::i16))) {
12235 "s_buffer_load_{u8, i8} are supported "
12236 "in GFX12 (or newer) architectures.");
12237 EVT VT = Src.getValueType();
12242 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12248 auto *
M = cast<MemSDNode>(Src);
12249 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12250 Opc,
DL, ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12254 VTSign->getVT() == MVT::i8) ||
12256 VTSign->getVT() == MVT::i16)) &&
12258 auto *
M = cast<MemSDNode>(Src);
12270 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12271 Src.getOperand(0).getValueType());
12274 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc,
SDLoc(
N),
12276 Ops,
M->getMemoryVT(),
12277 M->getMemOperand());
12278 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12285 DAGCombinerInfo &DCI)
const {
12293 if (
N->getOperand(0).isUndef())
12300 DAGCombinerInfo &DCI)
const {
12301 EVT VT =
N->getValueType(0);
12305 return DCI.DAG.getConstantFP(
12328 unsigned Opcode =
Op.getOpcode();
12332 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(
Op)) {
12333 const auto &
F = CFP->getValueAPF();
12334 if (
F.isNaN() &&
F.isSignaling())
12336 if (!
F.isDenormal())
12394 return Op.getValueType().getScalarType() != MVT::f16;
12459 if (
Op.getValueType() == MVT::i16) {
12470 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
12472 switch (IntrinsicID) {
12473 case Intrinsic::amdgcn_cvt_pkrtz:
12474 case Intrinsic::amdgcn_cubeid:
12475 case Intrinsic::amdgcn_frexp_mant:
12476 case Intrinsic::amdgcn_fdot2:
12477 case Intrinsic::amdgcn_rcp:
12478 case Intrinsic::amdgcn_rsq:
12479 case Intrinsic::amdgcn_rsq_clamp:
12480 case Intrinsic::amdgcn_rcp_legacy:
12481 case Intrinsic::amdgcn_rsq_legacy:
12482 case Intrinsic::amdgcn_trig_preop:
12483 case Intrinsic::amdgcn_log:
12484 case Intrinsic::amdgcn_exp2:
12505 unsigned Opcode =
MI->getOpcode();
12507 if (Opcode == AMDGPU::G_FCANONICALIZE)
12510 std::optional<FPValueAndVReg> FCR;
12513 if (FCR->Value.isSignaling())
12515 if (!FCR->Value.isDenormal())
12526 case AMDGPU::G_FADD:
12527 case AMDGPU::G_FSUB:
12528 case AMDGPU::G_FMUL:
12529 case AMDGPU::G_FCEIL:
12530 case AMDGPU::G_FFLOOR:
12531 case AMDGPU::G_FRINT:
12532 case AMDGPU::G_FNEARBYINT:
12533 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12534 case AMDGPU::G_INTRINSIC_TRUNC:
12535 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12536 case AMDGPU::G_FMA:
12537 case AMDGPU::G_FMAD:
12538 case AMDGPU::G_FSQRT:
12539 case AMDGPU::G_FDIV:
12540 case AMDGPU::G_FREM:
12541 case AMDGPU::G_FPOW:
12542 case AMDGPU::G_FPEXT:
12543 case AMDGPU::G_FLOG:
12544 case AMDGPU::G_FLOG2:
12545 case AMDGPU::G_FLOG10:
12546 case AMDGPU::G_FPTRUNC:
12547 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12548 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12549 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12550 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12551 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12553 case AMDGPU::G_FNEG:
12554 case AMDGPU::G_FABS:
12555 case AMDGPU::G_FCOPYSIGN:
12557 case AMDGPU::G_FMINNUM:
12558 case AMDGPU::G_FMAXNUM:
12559 case AMDGPU::G_FMINNUM_IEEE:
12560 case AMDGPU::G_FMAXNUM_IEEE:
12561 case AMDGPU::G_FMINIMUM:
12562 case AMDGPU::G_FMAXIMUM: {
12570 case AMDGPU::G_BUILD_VECTOR:
12575 case AMDGPU::G_INTRINSIC:
12576 case AMDGPU::G_INTRINSIC_CONVERGENT:
12578 case Intrinsic::amdgcn_fmul_legacy:
12579 case Intrinsic::amdgcn_fmad_ftz:
12580 case Intrinsic::amdgcn_sqrt:
12581 case Intrinsic::amdgcn_fmed3:
12582 case Intrinsic::amdgcn_sin:
12583 case Intrinsic::amdgcn_cos:
12584 case Intrinsic::amdgcn_log:
12585 case Intrinsic::amdgcn_exp2:
12586 case Intrinsic::amdgcn_log_clamp:
12587 case Intrinsic::amdgcn_rcp:
12588 case Intrinsic::amdgcn_rcp_legacy:
12589 case Intrinsic::amdgcn_rsq:
12590 case Intrinsic::amdgcn_rsq_clamp:
12591 case Intrinsic::amdgcn_rsq_legacy:
12592 case Intrinsic::amdgcn_div_scale:
12593 case Intrinsic::amdgcn_div_fmas:
12594 case Intrinsic::amdgcn_div_fixup:
12595 case Intrinsic::amdgcn_fract:
12596 case Intrinsic::amdgcn_cvt_pkrtz:
12597 case Intrinsic::amdgcn_cubeid:
12598 case Intrinsic::amdgcn_cubema:
12599 case Intrinsic::amdgcn_cubesc:
12600 case Intrinsic::amdgcn_cubetc:
12601 case Intrinsic::amdgcn_frexp_mant:
12602 case Intrinsic::amdgcn_fdot2:
12603 case Intrinsic::amdgcn_trig_preop:
12618SDValue SITargetLowering::getCanonicalConstantFP(
12621 if (
C.isDenormal()) {
12635 if (
C.isSignaling()) {
12654 return Op.isUndef() || isa<ConstantFPSDNode>(
Op);
12657SDValue SITargetLowering::performFCanonicalizeCombine(
12659 DAGCombinerInfo &DCI)
const {
12662 EVT VT =
N->getValueType(0);
12671 EVT VT =
N->getValueType(0);
12672 return getCanonicalConstantFP(DAG,
SDLoc(
N), VT, CFP->getValueAPF());
12688 EVT EltVT =
Lo.getValueType();
12691 for (
unsigned I = 0;
I != 2; ++
I) {
12694 NewElts[
I] = getCanonicalConstantFP(DAG, SL, EltVT,
12695 CFP->getValueAPF());
12696 }
else if (
Op.isUndef()) {
12708 if (isa<ConstantFPSDNode>(NewElts[1]))
12709 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
12714 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
12730 auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.
getOperand(1));
12735 SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
12736 DCI.AddToWorklist(Canon0.
getNode());
12785 if (!MinK || !MaxK)
12798 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->
hasMed3_16()))
12799 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
12841 if (
Info->getMode().DX10Clamp) {
12850 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->
hasMed3_16())) {
12874 DAGCombinerInfo &DCI)
const {
12877 EVT VT =
N->getValueType(0);
12878 unsigned Opc =
N->getOpcode();
12887 (VT == MVT::i32 || VT == MVT::f32 ||
12888 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->
hasMin3Max3_16()))) {
12895 N->getValueType(0),
12908 N->getValueType(0),
12918 if (
SDValue Med3 = performIntMed3ImmCombine(
12923 if (
SDValue Med3 = performIntMed3ImmCombine(
12929 if (
SDValue Med3 = performIntMed3ImmCombine(
12934 if (
SDValue Med3 = performIntMed3ImmCombine(
12944 (VT == MVT::f32 || VT == MVT::f64 ||
12948 if (
SDValue Res = performFPMed3ImmCombine(DAG,
SDLoc(
N), Op0, Op1))
12959 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
12960 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
12969 DAGCombinerInfo &DCI)
const {
12970 EVT VT =
N->getValueType(0);
12993 if (
Info->getMode().DX10Clamp) {
12996 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
12999 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13002 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13013 DAGCombinerInfo &DCI)
const {
13017 return DCI.DAG.getUNDEF(
N->getValueType(0));
13025 bool IsDivergentIdx,
13030 unsigned VecSize = EltSize * NumElem;
13033 if (VecSize <= 64 && EltSize < 32)
13042 if (IsDivergentIdx)
13046 unsigned NumInsts = NumElem +
13047 ((EltSize + 31) / 32) * NumElem ;
13052 return NumInsts <= 16;
13056 return NumInsts <= 15;
13061 if (isa<ConstantSDNode>(
Idx))
13074SDValue SITargetLowering::performExtractVectorEltCombine(
13075 SDNode *
N, DAGCombinerInfo &DCI)
const {
13081 EVT ResVT =
N->getValueType(0);
13100 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13128 DCI.AddToWorklist(Elt0.
getNode());
13129 DCI.AddToWorklist(Elt1.
getNode());
13151 if (!DCI.isBeforeLegalize())
13157 auto *
Idx = dyn_cast<ConstantSDNode>(
N->getOperand(1));
13158 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.
isByteSized() &&
13159 VecSize > 32 && VecSize % 32 == 0 &&
Idx) {
13162 unsigned BitIndex =
Idx->getZExtValue() * VecEltSize;
13163 unsigned EltIdx = BitIndex / 32;
13164 unsigned LeftoverBitIdx = BitIndex % 32;
13168 DCI.AddToWorklist(Cast.
getNode());
13172 DCI.AddToWorklist(Elt.
getNode());
13175 DCI.AddToWorklist(Srl.
getNode());
13179 DCI.AddToWorklist(Trunc.
getNode());
13181 if (VecEltVT == ResVT) {
13193SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
13194 DAGCombinerInfo &DCI)
const {
13208 EVT IdxVT =
Idx.getValueType();
13225 Src.getOperand(0).getValueType() == MVT::f16) {
13226 return Src.getOperand(0);
13229 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13230 APFloat Val = CFP->getValueAPF();
13231 bool LosesInfo =
true;
13241 DAGCombinerInfo &DCI)
const {
13243 "combine only useful on gfx8");
13245 SDValue TruncSrc =
N->getOperand(0);
13246 EVT VT =
N->getValueType(0);
13247 if (VT != MVT::f16)
13285unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
13287 const SDNode *N1)
const {
13292 if (((VT == MVT::f32 &&
13294 (VT == MVT::f16 && Subtarget->
hasMadF16() &&
13314 EVT VT =
N->getValueType(0);
13315 if (VT != MVT::i32 && VT != MVT::i64)
13321 unsigned Opc =
N->getOpcode();
13344 return DAG.
getNode(Opc, SL, VT, Add1, Op2);
13366 DAGCombinerInfo &DCI)
const {
13370 EVT VT =
N->getValueType(0);
13380 if (!
N->isDivergent() && Subtarget->
hasSMulHi())
13384 if (NumBits <= 32 || NumBits > 64)
13396 unsigned NumUsers = 0;
13421 bool MulSignedLo =
false;
13422 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13431 if (VT != MVT::i64) {
13454 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13456 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13458 std::tie(AccumLo, AccumHi) = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13460 if (!MulLHSUnsigned32) {
13467 if (!MulRHSUnsigned32) {
13478 if (VT != MVT::i64)
13485static std::optional<ByteProvider<SDValue>>
13488 if (!Byte0 || Byte0->isConstantZero()) {
13489 return std::nullopt;
13492 if (Byte1 && !Byte1->isConstantZero()) {
13493 return std::nullopt;
13499 unsigned FirstCs =
First & 0x0c0c0c0c;
13500 unsigned SecondCs = Second & 0x0c0c0c0c;
13501 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
13502 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13504 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13505 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13506 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13507 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13509 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13521 Src0s.push_back({*Src0.
Src, (Src0.
SrcOffset << 24) + 0x0c0c0c});
13522 Src1s.push_back({*Src1.
Src, (Src1.
SrcOffset << 24) + 0x0c0c0c});
13526 for (
int BPI = 0; BPI < 2; BPI++) {
13529 BPP = {Src1, Src0};
13531 unsigned ZeroMask = 0x0c0c0c0c;
13532 unsigned FMask = 0xFF << (8 * (3 - Step));
13534 unsigned FirstMask =
13535 BPP.first.
SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13536 unsigned SecondMask =
13537 BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13541 int FirstGroup = -1;
13542 for (
int I = 0;
I < 2;
I++) {
13544 I == 0 ? Src0s : Src1s;
13545 auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) {
13546 return IterElt.first == *BPP.first.Src;
13556 if (FirstGroup != -1) {
13558 FirstGroup == 1 ? Src0s : Src1s;
13559 auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) {
13560 return IterElt.first == *BPP.second.Src;
13566 Srcs.
push_back({*BPP.second.Src, SecondMask});
13574 unsigned ZeroMask = 0x0c0c0c0c;
13575 unsigned FMask = 0xFF << (8 * (3 - Step));
13578 {*Src0.
Src, (Src0.
SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
13580 {*Src1.
Src, (Src1.
SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
13588 bool IsSigned,
bool IsAny) {
13591 if (Srcs.size() == 1) {
13592 auto Elt = Srcs.begin();
13596 if (Elt->second == 0x3020100)
13603 auto FirstElt = Srcs.begin();
13604 auto SecondElt = std::next(FirstElt);
13611 auto FirstMask = FirstElt->second;
13612 auto SecondMask = SecondElt->second;
13614 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13615 unsigned FirstPlusFour = FirstMask | 0x04040404;
13618 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13630 FirstElt = std::next(SecondElt);
13631 if (FirstElt == Srcs.end())
13634 SecondElt = std::next(FirstElt);
13637 if (SecondElt == Srcs.end()) {
13643 DAG.
getConstant(FirstElt->second, SL, MVT::i32)));
13649 return Perms.
size() == 2
13655 unsigned ChainLength) {
13656 for (
auto &[EntryVal, EntryMask] : Srcs) {
13657 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13658 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13659 EntryMask += ZeroMask;
13664 auto Opcode =
Op.getOpcode();
13670static std::optional<bool>
13681 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13684 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13686 assert(!(S0IsUnsigned && S0IsSigned));
13687 assert(!(S1IsUnsigned && S1IsSigned));
13695 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
13701 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
13702 return std::nullopt;
13714 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
13715 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
13720 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
13726 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
13727 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
13728 return std::nullopt;
13734 DAGCombinerInfo &DCI)
const {
13736 EVT VT =
N->getValueType(0);
13743 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
13748 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
13755 std::optional<bool> IsSigned;
13761 int ChainLength = 0;
13762 for (
int I = 0;
I < 4;
I++) {
13763 auto MulIdx =
isMul(LHS) ? 0 :
isMul(RHS) ? 1 : -1;
13766 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
13769 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
13774 TempNode->getOperand(MulIdx), *Src0, *Src1,
13775 TempNode->getOperand(MulIdx)->getOperand(0),
13776 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
13780 IsSigned = *IterIsSigned;
13781 if (*IterIsSigned != *IsSigned)
13784 auto AddIdx = 1 - MulIdx;
13787 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
13788 Src2s.
push_back(TempNode->getOperand(AddIdx));
13798 TempNode->getOperand(AddIdx), *Src0, *Src1,
13799 TempNode->getOperand(AddIdx)->getOperand(0),
13800 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
13804 if (*IterIsSigned != *IsSigned)
13808 ChainLength =
I + 2;
13812 TempNode = TempNode->getOperand(AddIdx);
13814 ChainLength =
I + 1;
13815 if (TempNode->getNumOperands() < 2)
13817 LHS = TempNode->getOperand(0);
13818 RHS = TempNode->getOperand(1);
13821 if (ChainLength < 2)
13827 if (ChainLength < 4) {
13837 bool UseOriginalSrc =
false;
13838 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
13839 Src0s.
begin()->second == Src1s.
begin()->second &&
13840 Src0s.
begin()->first.getValueSizeInBits() == 32 &&
13841 Src1s.
begin()->first.getValueSizeInBits() == 32) {
13843 auto Src0Mask = Src0s.
begin()->second;
13844 SrcBytes.
push_back(Src0Mask & 0xFF000000);
13845 bool UniqueEntries =
true;
13846 for (
auto I = 1;
I < 4;
I++) {
13847 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
13850 UniqueEntries =
false;
13856 if (UniqueEntries) {
13857 UseOriginalSrc =
true;
13859 assert(Src0s.
begin()->first.getValueSizeInBits() == 32);
13860 assert(Src1s.
begin()->first.getValueSizeInBits() == 32);
13866 if (!UseOriginalSrc) {
13873 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
13876 : Intrinsic::amdgcn_udot4,
13886 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
13891 unsigned Opc =
LHS.getOpcode();
13896 Opc =
RHS.getOpcode();
13902 auto Cond =
RHS.getOperand(0);
13910 return DAG.
getNode(Opc, SL, VTList, Args);
13924 DAGCombinerInfo &DCI)
const {
13926 EVT VT =
N->getValueType(0);
13928 if (VT != MVT::i32)
13937 unsigned Opc =
RHS.getOpcode();
13943 auto Cond =
RHS.getOperand(0);
13951 return DAG.
getNode(Opc, SL, VTList, Args);
13965SDValue SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
13966 DAGCombinerInfo &DCI)
const {
13968 if (
N->getValueType(0) != MVT::i32)
13979 unsigned LHSOpc =
LHS.getOpcode();
13980 unsigned Opc =
N->getOpcode();
13990 DAGCombinerInfo &DCI)
const {
13995 EVT VT =
N->getValueType(0);
14007 if (
A ==
LHS.getOperand(1)) {
14008 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
14009 if (FusedOp != 0) {
14011 return DAG.
getNode(FusedOp, SL, VT,
A, Two, RHS);
14019 if (
A ==
RHS.getOperand(1)) {
14020 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
14021 if (FusedOp != 0) {
14023 return DAG.
getNode(FusedOp, SL, VT,
A, Two, LHS);
14032 DAGCombinerInfo &DCI)
const {
14038 EVT VT =
N->getValueType(0);
14051 if (
A ==
LHS.getOperand(1)) {
14052 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
14057 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
14066 if (
A ==
RHS.getOperand(1)) {
14067 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
14070 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo, LHS);
14079 DAGCombinerInfo &DCI)
const {
14082 EVT VT =
N->getValueType(0);
14096 bool IsNegative =
false;
14097 if (CLHS->isExactlyValue(1.0) ||
14098 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14114 DAGCombinerInfo &DCI)
const {
14116 EVT VT =
N->getValueType(0);
14138 (
N->getFlags().hasAllowContract() &&
14139 FMA->getFlags().hasAllowContract())) {
14173 if (Vec1 == Vec2 || Vec3 == Vec4)
14179 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14180 (Vec1 == Vec4 && Vec2 == Vec3)) {
14189 DAGCombinerInfo &DCI)
const {
14195 EVT VT =
LHS.getValueType();
14198 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14200 CRHS = dyn_cast<ConstantSDNode>(LHS);
14224 return LHS.getOperand(0);
14230 isa<ConstantSDNode>(
LHS.getOperand(1)) &&
14231 isa<ConstantSDNode>(
LHS.getOperand(2)) &&
14232 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
14239 const APInt &CT =
LHS.getConstantOperandAPInt(1);
14240 const APInt &CF =
LHS.getConstantOperandAPInt(2);
14248 return LHS.getOperand(0);
14252 if (VT != MVT::f32 && VT != MVT::f64 &&
14285 DAGCombinerInfo &DCI)
const {
14303 if (
auto *
C = dyn_cast<ConstantSDNode>(Shift.
getOperand(1))) {
14307 unsigned ShiftOffset = 8 *
Offset;
14309 ShiftOffset -=
C->getZExtValue();
14311 ShiftOffset +=
C->getZExtValue();
14313 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14315 MVT::f32, Shifted);
14326 DCI.AddToWorklist(
N);
14333 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
14339 DAGCombinerInfo &DCI)
const {
14349 return DCI.DAG.getConstantFP(Zero,
SDLoc(
N),
N->getValueType(0));
14352 APFloat One(
F.getSemantics(),
"1.0");
14354 return DCI.DAG.getConstantFP(One,
SDLoc(
N),
N->getValueType(0));
14364 switch (
N->getOpcode()) {
14366 return performAddCombine(
N, DCI);
14368 return performSubCombine(
N, DCI);
14371 return performAddCarrySubCarryCombine(
N, DCI);
14373 return performFAddCombine(
N, DCI);
14375 return performFSubCombine(
N, DCI);
14377 return performFDivCombine(
N, DCI);
14379 return performSetCCCombine(
N, DCI);
14392 return performMinMaxCombine(
N, DCI);
14394 return performFMACombine(
N, DCI);
14396 return performAndCombine(
N, DCI);
14398 return performOrCombine(
N, DCI);
14401 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
14402 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14408 return performXorCombine(
N, DCI);
14410 return performZeroExtendCombine(
N, DCI);
14412 return performSignExtendInRegCombine(
N , DCI);
14414 return performClassCombine(
N, DCI);
14416 return performFCanonicalizeCombine(
N, DCI);
14418 return performRcpCombine(
N, DCI);
14433 return performUCharToFloatCombine(
N, DCI);
14435 return performFCopySignCombine(
N, DCI);
14440 return performCvtF32UByteNCombine(
N, DCI);
14442 return performFMed3Combine(
N, DCI);
14444 return performCvtPkRTZCombine(
N, DCI);
14446 return performClampCombine(
N, DCI);
14449 EVT VT =
N->getValueType(0);
14452 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2f16) {
14455 EVT EltVT = Src.getValueType();
14456 if (EltVT != MVT::i16)
14466 return performExtractVectorEltCombine(
N, DCI);
14468 return performInsertVectorEltCombine(
N, DCI);
14470 return performFPRoundCombine(
N, DCI);
14472 if (
SDValue Widened = widenLoad(cast<LoadSDNode>(
N), DCI))
14478 if (
MemSDNode *MemNode = dyn_cast<MemSDNode>(
N))
14479 return performMemSDNodeCombine(MemNode, DCI);
14492 default:
return ~0u;
14493 case AMDGPU::sub0:
return 0;
14494 case AMDGPU::sub1:
return 1;
14495 case AMDGPU::sub2:
return 2;
14496 case AMDGPU::sub3:
return 3;
14497 case AMDGPU::sub4:
return 4;
14504 unsigned Opcode =
Node->getMachineOpcode();
14508 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
14514 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
14515 unsigned NewDmask = 0;
14518 bool UsesTFC = ((int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
14519 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx)))
14522 unsigned TFCLane = 0;
14523 bool HasChain =
Node->getNumValues() > 1;
14525 if (OldDmask == 0) {
14533 TFCLane = OldBitsSet;
14541 if (
I.getUse().getResNo() != 0)
14545 if (!
I->isMachineOpcode() ||
14546 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14558 if (UsesTFC && Lane == TFCLane) {
14563 for (
unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14565 Dmask &= ~(1 << Comp);
14573 NewDmask |= 1 << Comp;
14578 bool NoChannels = !NewDmask;
14585 if (OldBitsSet == 1)
14591 if (NewDmask == OldDmask)
14600 unsigned NewChannels = BitsSet + UsesTFC;
14604 assert(NewOpcode != -1 &&
14605 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
14606 "failed to find equivalent MIMG op");
14614 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
14616 MVT ResultVT = NewChannels == 1 ?
14618 NewChannels == 5 ? 8 : NewChannels);
14632 if (NewChannels == 1) {
14642 for (
unsigned i = 0,
Idx = AMDGPU::sub0; i < 5; ++i) {
14647 if (i || !NoChannels)
14652 if (NewUser !=
User) {
14660 case AMDGPU::sub0:
Idx = AMDGPU::sub1;
break;
14661 case AMDGPU::sub1:
Idx = AMDGPU::sub2;
break;
14662 case AMDGPU::sub2:
Idx = AMDGPU::sub3;
break;
14663 case AMDGPU::sub3:
Idx = AMDGPU::sub4;
break;
14673 Op =
Op.getOperand(0);
14675 return isa<FrameIndexSDNode>(
Op);
14684 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
14685 SDValue SrcVal = Node->getOperand(2);
14693 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
14695 SDNode *Glued = Node->getGluedNode();
14697 = DAG.
getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
14704 return ToResultReg.
getNode();
14709 for (
unsigned i = 0; i < Node->getNumOperands(); ++i) {
14717 Node->getOperand(i).getValueType(),
14718 Node->getOperand(i)), 0));
14729 unsigned Opcode = Node->getMachineOpcode();
14731 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
14732 !
TII->isGather4(Opcode) &&
14734 return adjustWritemask(Node, DAG);
14737 if (Opcode == AMDGPU::INSERT_SUBREG ||
14738 Opcode == AMDGPU::REG_SEQUENCE) {
14744 case AMDGPU::V_DIV_SCALE_F32_e64:
14745 case AMDGPU::V_DIV_SCALE_F64_e64: {
14749 SDValue Src0 = Node->getOperand(1);
14750 SDValue Src1 = Node->getOperand(3);
14751 SDValue Src2 = Node->getOperand(5);
14755 (Src0 == Src1 || Src0 == Src2))
14817 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
14818 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
14819 unsigned D16Val = D16 ? D16->getImm() : 0;
14821 if (!TFEVal && !LWEVal)
14837 assert(MO_Dmask &&
"Expected dmask operand in instruction");
14839 unsigned dmask = MO_Dmask->
getImm();
14847 D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
14852 uint32_t DstSize =
TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
14853 if (DstSize < InitIdx)
14857 Register PrevDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
14858 unsigned NewDst = 0;
14867 for (; SizeLeft; SizeLeft--, CurrIdx++) {
14868 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
14886 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
14899 if (
TII->isVOP3(
MI.getOpcode())) {
14901 TII->legalizeOperandsVOP3(
MRI,
MI);
14906 if (!
MI.getDesc().operands().empty()) {
14907 unsigned Opc =
MI.getOpcode();
14908 bool HasAGPRs =
Info->mayNeedAGPRs();
14916 if ((
I == Src2Idx) && (HasAGPRs))
14919 if (!
Op.isReg() || !
Op.getReg().isVirtual())
14921 auto *RC =
TRI->getRegClassForReg(
MRI,
Op.getReg());
14922 if (!
TRI->hasAGPRs(RC))
14924 auto *Src =
MRI.getUniqueVRegDef(
Op.getReg());
14925 if (!Src || !Src->isCopy() ||
14926 !
TRI->isSGPRReg(
MRI, Src->getOperand(1).getReg()))
14928 auto *NewRC =
TRI->getEquivalentVGPRClass(RC);
14932 MRI.setRegClass(
Op.getReg(), NewRC);
14939 if (
auto *Src2 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src2)) {
14940 if (Src2->isReg() && Src2->getReg().isVirtual()) {
14941 auto *RC =
TRI->getRegClassForReg(
MRI, Src2->getReg());
14942 if (
TRI->isVectorSuperClass(RC)) {
14943 auto *NewRC =
TRI->getEquivalentAGPRClass(RC);
14944 MRI.setRegClass(Src2->getReg(), NewRC);
14945 if (Src2->isTied())
14946 MRI.setRegClass(
MI.getOperand(0).getReg(), NewRC);
14955 if (
TII->isImage(
MI)) {
14956 if (!
MI.mayStore())
14958 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
14985 MVT::v2i32, Ops0), 0);
15015 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15037std::pair<unsigned, const TargetRegisterClass *>
15044 if (Constraint.
size() == 1) {
15046 switch (Constraint[0]) {
15053 RC = &AMDGPU::SReg_32RegClass;
15056 RC = &AMDGPU::SGPR_64RegClass;
15061 return std::pair(0U,
nullptr);
15068 RC = &AMDGPU::VGPR_32RegClass;
15073 return std::pair(0U,
nullptr);
15082 RC = &AMDGPU::AGPR_32RegClass;
15087 return std::pair(0U,
nullptr);
15096 return std::pair(0U, RC);
15101 if (
RegName.consume_front(
"v")) {
15102 RC = &AMDGPU::VGPR_32RegClass;
15103 }
else if (
RegName.consume_front(
"s")) {
15104 RC = &AMDGPU::SGPR_32RegClass;
15105 }
else if (
RegName.consume_front(
"a")) {
15106 RC = &AMDGPU::AGPR_32RegClass;
15111 if (
RegName.consume_front(
"[")) {
15121 RC =
TRI->getVGPRClassForBitWidth(Width);
15123 RC =
TRI->getSGPRClassForBitWidth(Width);
15125 RC =
TRI->getAGPRClassForBitWidth(Width);
15127 Reg =
TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15128 return std::pair(Reg, RC);
15133 if (!
Failed && Idx < RC->getNumRegs())
15141 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
15147 if (Constraint.
size() == 1) {
15148 switch (Constraint[0]) {
15157 }
else if (Constraint ==
"DA" ||
15158 Constraint ==
"DB") {
15166 if (Constraint.
size() == 1) {
15167 switch (Constraint[0]) {
15183 Val = Val & maskTrailingOnes<uint64_t>(
Size);
15190 std::vector<SDValue> &Ops,
15205 unsigned Size =
Op.getScalarValueSizeInBits();
15213 Val =
C->getSExtValue();
15217 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
15223 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
15226 Val =
C->getSExtValue();
15230 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
15240 if (Constraint.
size() == 1) {
15241 switch (Constraint[0]) {
15245 return isInt<16>(Val);
15249 return isInt<32>(Val);
15256 }
else if (Constraint.
size() == 2) {
15257 if (Constraint ==
"DA") {
15258 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
15259 int64_t LoBits =
static_cast<int32_t
>(Val);
15263 if (Constraint ==
"DB") {
15272 unsigned MaxSize)
const {
15273 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
15284 switch (UnalignedClassID) {
15285 case AMDGPU::VReg_64RegClassID:
15286 return AMDGPU::VReg_64_Align2RegClassID;
15287 case AMDGPU::VReg_96RegClassID:
15288 return AMDGPU::VReg_96_Align2RegClassID;
15289 case AMDGPU::VReg_128RegClassID:
15290 return AMDGPU::VReg_128_Align2RegClassID;
15291 case AMDGPU::VReg_160RegClassID:
15292 return AMDGPU::VReg_160_Align2RegClassID;
15293 case AMDGPU::VReg_192RegClassID:
15294 return AMDGPU::VReg_192_Align2RegClassID;
15295 case AMDGPU::VReg_224RegClassID:
15296 return AMDGPU::VReg_224_Align2RegClassID;
15297 case AMDGPU::VReg_256RegClassID:
15298 return AMDGPU::VReg_256_Align2RegClassID;
15299 case AMDGPU::VReg_288RegClassID:
15300 return AMDGPU::VReg_288_Align2RegClassID;
15301 case AMDGPU::VReg_320RegClassID:
15302 return AMDGPU::VReg_320_Align2RegClassID;
15303 case AMDGPU::VReg_352RegClassID:
15304 return AMDGPU::VReg_352_Align2RegClassID;
15305 case AMDGPU::VReg_384RegClassID:
15306 return AMDGPU::VReg_384_Align2RegClassID;
15307 case AMDGPU::VReg_512RegClassID:
15308 return AMDGPU::VReg_512_Align2RegClassID;
15309 case AMDGPU::VReg_1024RegClassID:
15310 return AMDGPU::VReg_1024_Align2RegClassID;
15311 case AMDGPU::AReg_64RegClassID:
15312 return AMDGPU::AReg_64_Align2RegClassID;
15313 case AMDGPU::AReg_96RegClassID:
15314 return AMDGPU::AReg_96_Align2RegClassID;
15315 case AMDGPU::AReg_128RegClassID:
15316 return AMDGPU::AReg_128_Align2RegClassID;
15317 case AMDGPU::AReg_160RegClassID:
15318 return AMDGPU::AReg_160_Align2RegClassID;
15319 case AMDGPU::AReg_192RegClassID:
15320 return AMDGPU::AReg_192_Align2RegClassID;
15321 case AMDGPU::AReg_256RegClassID:
15322 return AMDGPU::AReg_256_Align2RegClassID;
15323 case AMDGPU::AReg_512RegClassID:
15324 return AMDGPU::AReg_512_Align2RegClassID;
15325 case AMDGPU::AReg_1024RegClassID:
15326 return AMDGPU::AReg_1024_Align2RegClassID;
15342 if (
Info->isEntryFunction()) {
15349 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15351 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15352 :
TRI->getAlignedHighSGPRForRC(MF, 2,
15353 &AMDGPU::SGPR_64RegClass);
15354 Info->setSGPRForEXECCopy(SReg);
15357 Info->getStackPtrOffsetReg()));
15358 if (
Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15359 MRI.replaceRegWith(AMDGPU::SP_REG,
Info->getStackPtrOffsetReg());
15363 if (
Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15364 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG,
Info->getScratchRSrcReg());
15366 if (
Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15367 MRI.replaceRegWith(AMDGPU::FP_REG,
Info->getFrameOffsetReg());
15369 Info->limitOccupancy(MF);
15371 if (ST.isWave32() && !MF.
empty()) {
15372 for (
auto &
MBB : MF) {
15373 for (
auto &
MI :
MBB) {
15374 TII->fixImplicitOperands(
MI);
15384 if (ST.needsAlignedVGPRs()) {
15385 for (
unsigned I = 0,
E =
MRI.getNumVirtRegs();
I !=
E; ++
I) {
15391 if (NewClassID != -1)
15392 MRI.setRegClass(Reg,
TRI->getRegClass(NewClassID));
15401 const APInt &DemandedElts,
15403 unsigned Depth)
const {
15405 unsigned Opc =
Op.getOpcode();
15408 unsigned IID =
Op.getConstantOperandVal(0);
15410 case Intrinsic::amdgcn_mbcnt_lo:
15411 case Intrinsic::amdgcn_mbcnt_hi: {
15418 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15420 MaxActiveBits += Src1ValBits ? 1 : 0;
15421 unsigned Size =
Op.getValueType().getSizeInBits();
15422 if (MaxActiveBits <
Size)
15431 Op, Known, DemandedElts, DAG,
Depth);
15446 unsigned MaxValue =
15455 switch (
MI->getOpcode()) {
15456 case AMDGPU::G_INTRINSIC:
15457 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15459 case Intrinsic::amdgcn_workitem_id_x:
15462 case Intrinsic::amdgcn_workitem_id_y:
15465 case Intrinsic::amdgcn_workitem_id_z:
15468 case Intrinsic::amdgcn_mbcnt_lo:
15469 case Intrinsic::amdgcn_mbcnt_hi: {
15471 unsigned Size =
MRI.getType(R).getSizeInBits();
15475 case Intrinsic::amdgcn_groupstaticsize: {
15486 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15489 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15492 case AMDGPU::G_AMDGPU_SMED3:
15493 case AMDGPU::G_AMDGPU_UMED3: {
15494 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
15521 unsigned Depth)
const {
15523 if (
auto *GI = dyn_cast<GIntrinsic>(
MI)) {
15529 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
15556 if (Header->getAlignment() != PrefAlign)
15557 return Header->getAlignment();
15559 unsigned LoopSize = 0;
15567 LoopSize +=
TII->getInstSizeInBytes(
MI);
15568 if (LoopSize > 192)
15573 if (LoopSize <= 64)
15576 if (LoopSize <= 128)
15577 return CacheLineAlign;
15583 auto I = Exit->getFirstNonDebugInstr();
15584 if (
I != Exit->end() &&
I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15585 return CacheLineAlign;
15594 if (PreTerm == Pre->
begin() ||
15595 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15600 if (ExitHead == Exit->
end() ||
15601 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15606 return CacheLineAlign;
15614 N =
N->getOperand(0).getNode();
15625 switch (
N->getOpcode()) {
15633 if (Reg.isPhysical() ||
MRI.isLiveIn(Reg))
15634 return !
TRI->isSGPRReg(
MRI, Reg);
15640 return !
TRI->isSGPRReg(
MRI, Reg);
15644 unsigned AS = L->getAddressSpace();
15678 if (
auto *
A = dyn_cast<AtomicSDNode>(
N)) {
15680 return A->readMem() &&
A->writeMem();
15715 unsigned Depth)
const {
15720 if (
Info->getMode().DX10Clamp)
15744 return F->getFnAttribute(
"amdgpu-unsafe-fp-atomics").getValueAsString() !=
15766 <<
"Hardware instruction generated for atomic "
15768 <<
" operation at memory scope " << MemScope
15769 <<
" due to an unsafe request.";
15774 bool HasSystemScope =
15797 if (HasSystemScope)
15865 if (HasSystemScope)
15902 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
15904 : &AMDGPU::SReg_32RegClass;
15905 if (!
TRI->isSGPRClass(RC) && !isDivergent)
15906 return TRI->getEquivalentSGPRClass(RC);
15907 else if (
TRI->isSGPRClass(RC) && isDivergent)
15908 return TRI->getEquivalentVGPRClass(RC);
15920 unsigned WaveSize) {
15925 if (!
IT ||
IT->getBitWidth() != WaveSize)
15928 if (!isa<Instruction>(V))
15930 if (!Visited.
insert(V).second)
15932 bool Result =
false;
15933 for (
const auto *U : V->users()) {
15934 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
15935 if (V == U->getOperand(1)) {
15936 switch (Intrinsic->getIntrinsicID()) {
15940 case Intrinsic::amdgcn_if_break:
15941 case Intrinsic::amdgcn_if:
15942 case Intrinsic::amdgcn_else:
15947 if (V == U->getOperand(0)) {
15948 switch (Intrinsic->getIntrinsicID()) {
15952 case Intrinsic::amdgcn_end_cf:
15953 case Intrinsic::amdgcn_loop:
15959 Result =
hasCFUser(U, Visited, WaveSize);
15968 const Value *V)
const {
15969 if (
const CallInst *CI = dyn_cast<CallInst>(V)) {
15970 if (CI->isInlineAsm()) {
15979 for (
auto &TC : TargetConstraints) {
15983 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
15996 for (;
I !=
E; ++
I) {
15997 if (
MemSDNode *M = dyn_cast<MemSDNode>(*
I)) {
16020 return MRI.hasOneNonDBGUse(N0);
16026 if (
I.getMetadata(
"amdgpu.noclobber"))
16036 if (!Def->isMachineOpcode())
16047 PhysReg = AMDGPU::SCC;
16049 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16058 "target should have atomic fadd instructions");
16061 "generic atomicrmw expansion only supports FP32 operand in flat "
16064 "only fadd is supported for now");
16136 for (
auto &
P : MDs)
16147 {
Addr},
nullptr,
"is.shared");
16148 Builder.
CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16153 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16158 Intrinsic::amdgcn_is_private, {}, {
Addr},
nullptr,
"is.private");
16164 Value *LoadedPrivate =
16165 Builder.
CreateLoad(ValTy, CastToPrivate,
"loaded.private");
16173 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
static const unsigned MaxDepth
Contains matchers for matching SSA Machine Instructions.
unsigned const TargetRegisterInfo * TRI
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void fixMasks(SmallVectorImpl< std::pair< SDValue, unsigned > > &Srcs, unsigned ChainLength)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
bool unsafeFPAtomicsDisabled(Function *F)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< std::pair< SDValue, unsigned > > &Srcs, bool IsSigned, bool IsAny)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< std::pair< SDValue, unsigned > > &Src0s, SmallVectorImpl< std::pair< SDValue, unsigned > > &Src1s, int Step)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getIdxEn(SDValue VIndex)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
unsigned getSrcAddressSpace() const
unsigned getDestAddressSpace() const
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
@ Min
*p = old <signed v ? old : v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly not equal, like -0.0 and 0.0.
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value corresponding to a virtual register.
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
bool hasD16Images() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasUnalignedScratchAccess() const
bool hasLDSFPAtomicAdd() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool usePRTStrictNull() const
bool needsKernargPreloadBackwardsCompatibility() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicFaddNoRtnInsts() const
bool hasScalarDwordx3Loads() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
LLVMContext & getContext() const
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
const BasicBlock * getParent() const
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
bool isCompare() const
Return true if this instruction is a comparison.
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Wrapper class representing physical registers. Should be passed by value.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstNonDebugInstr(bool SkipPseudoOp=true)
Returns an iterator to the first non-debug instruction in the basic block, or end().
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined with a multiply to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
bool hasAtomicFaddRtnForTy(SDValue &Op) const
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
void AddIMGInit(MachineInstr &MI) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array, into the specified DAG.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was registered to use 'custom' lowering for that result type.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' flag.
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to use 'custom' lowering, and whose defined values are all legal.
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform a atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representation suitable for instruction selection.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, uint64_t Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' flag.
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const Module &M, const GlobalValue *GV) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
constexpr bool isZero() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ BUFFER_ATOMIC_FADD_BF16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width)
void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static const fltSemantics & IEEEsingle() LLVM_READNONE
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const